diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2016-11-25 13:45:48 +1100 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2016-11-25 13:45:48 +1100 |
commit | ff32b572d7a70550f121368c1d8935a97e2b9163 (patch) | |
tree | 059d62da1d43b4d142718c7b2317ee1383b3fe75 | |
parent | 6708e2500917f2fa8870f9da984940776a6078de (diff) | |
parent | f0ca634652747d0af6a359def2b077308910f0f0 (diff) |
Merge branch 'akpm/master'
56 files changed, 1375 insertions, 1283 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 960108560ab2..08c9e56c66ec 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -556,7 +556,7 @@ till "end_pgoff". ->map_pages() is called with page table locked and must not block. If it's not possible to reach a page without blocking, filesystem should skip it. Filesystem should use do_set_pte() to setup page table entry. Pointer to entry associated with the page is passed in -"pte" field in fault_env structure. Pointers to entries for other offsets +"pte" field in vm_fault structure. Pointers to entries for other offsets should be calculated relative to "pte". ->page_mkwrite() is called when a previously read-only pte is diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index 06254467e4dd..3a147122bc98 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -236,7 +236,6 @@ static int spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct spu_context *ctx = vma->vm_file->private_data; - unsigned long address = (unsigned long)vmf->virtual_address; unsigned long pfn, offset; offset = vmf->pgoff << PAGE_SHIFT; @@ -244,7 +243,7 @@ spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return VM_FAULT_SIGBUS; pr_debug("spufs_mem_mmap_fault address=0x%lx, offset=0x%lx\n", - address, offset); + vmf->address, offset); if (spu_acquire(ctx)) return VM_FAULT_NOPAGE; @@ -256,7 +255,7 @@ spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot); pfn = (ctx->spu->local_store_phys + offset) >> PAGE_SHIFT; } - vm_insert_pfn(vma, address, pfn); + vm_insert_pfn(vma, vmf->address, pfn); spu_release(ctx); @@ -355,8 +354,7 @@ static int spufs_ps_fault(struct vm_area_struct *vma, down_read(¤t->mm->mmap_sem); } else { area = ctx->spu->problem_phys + ps_offs; - vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, - (area + offset) >> PAGE_SHIFT); + vm_insert_pfn(vma, vmf->address, (area + offset) >> PAGE_SHIFT); spu_context_trace(spufs_ps_fault__insert, ctx, ctx->spu); } diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c index a9973bb4a1b2..95e73c63c99d 100644 --- a/arch/sparc/kernel/nmi.c +++ b/arch/sparc/kernel/nmi.c @@ -42,7 +42,7 @@ static int panic_on_timeout; */ atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ EXPORT_SYMBOL(nmi_active); - +static int nmi_init_done; static unsigned int nmi_hz = HZ; static DEFINE_PER_CPU(short, wd_enabled); static int endflag __initdata; @@ -153,6 +153,8 @@ static void report_broken_nmi(int cpu, int *prev_nmi_count) void stop_nmi_watchdog(void *unused) { + if (!__this_cpu_read(wd_enabled)) + return; pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable); __this_cpu_write(wd_enabled, 0); atomic_dec(&nmi_active); @@ -207,6 +209,9 @@ error: void start_nmi_watchdog(void *unused) { + if (__this_cpu_read(wd_enabled)) + return; + __this_cpu_write(wd_enabled, 1); atomic_inc(&nmi_active); @@ -259,6 +264,8 @@ int __init nmi_init(void) } } + nmi_init_done = 1; + return err; } @@ -270,3 +277,38 @@ static int __init setup_nmi_watchdog(char *str) return 0; } __setup("nmi_watchdog=", setup_nmi_watchdog); + +/* + * sparc specific NMI watchdog enable function. + * Enables watchdog if it is not enabled already. + */ +int watchdog_nmi_enable(unsigned int cpu) +{ + if (atomic_read(&nmi_active) == -1) { + pr_warn("NMI watchdog cannot be enabled or disabled\n"); + return -1; + } + + /* + * watchdog thread could start even before nmi_init is called. + * Just Return in that case. Let nmi_init finish the init + * process first. + */ + if (!nmi_init_done) + return 0; + + smp_call_function_single(cpu, start_nmi_watchdog, NULL, 1); + + return 0; +} +/* + * sparc specific NMI watchdog disable function. + * Disables watchdog if it is not disabled already. + */ +void watchdog_nmi_disable(unsigned int cpu) +{ + if (atomic_read(&nmi_active) == -1) + pr_warn_once("NMI watchdog cannot be enabled or disabled\n"); + else + smp_call_function_single(cpu, stop_nmi_watchdog, NULL, 1); +} diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index e739002427ed..40121d14d34d 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -109,7 +109,7 @@ static int vvar_fault(const struct vm_special_mapping *sm, return VM_FAULT_SIGBUS; if (sym_offset == image->sym_vvar_page) { - ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, + ret = vm_insert_pfn(vma, vmf->address, __pa_symbol(&__vvar_page) >> PAGE_SHIFT); } else if (sym_offset == image->sym_pvclock_page) { struct pvclock_vsyscall_time_info *pvti = @@ -117,7 +117,7 @@ static int vvar_fault(const struct vm_special_mapping *sm, if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { ret = vm_insert_pfn( vma, - (unsigned long)vmf->virtual_address, + vmf->address, __pa(pvti) >> PAGE_SHIFT); } } diff --git a/drivers/char/agp/alpha-agp.c b/drivers/char/agp/alpha-agp.c index 199b8e99f7d7..737187865269 100644 --- a/drivers/char/agp/alpha-agp.c +++ b/drivers/char/agp/alpha-agp.c @@ -19,8 +19,7 @@ static int alpha_core_agp_vm_fault(struct vm_area_struct *vma, unsigned long pa; struct page *page; - dma_addr = (unsigned long)vmf->virtual_address - vma->vm_start - + agp->aperture.bus_base; + dma_addr = vmf->address - vma->vm_start + agp->aperture.bus_base; pa = agp->ops->translate(agp, dma_addr); if (pa == (unsigned long)-EINVAL) diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c index f3f92d5fcda0..a697ca0cab1e 100644 --- a/drivers/char/mspec.c +++ b/drivers/char/mspec.c @@ -227,7 +227,7 @@ mspec_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * be because another thread has installed the pte first, so it * is no problem. */ - vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn); + vm_insert_pfn(vma, vmf->address, pfn); return VM_FAULT_NOPAGE; } diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 3d94ff20fdca..028b56ab9e31 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -328,7 +328,6 @@ static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff, static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, struct vm_fault *vmf) { - unsigned long vaddr = (unsigned long) vmf->virtual_address; struct device *dev = &dax_dev->dev; struct dax_region *dax_region; int rc = VM_FAULT_SIGBUS; @@ -353,7 +352,7 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); - rc = vm_insert_mixed(vma, vaddr, pfn); + rc = vm_insert_mixed(vma, vmf->address, pfn); if (rc == -ENOMEM) return VM_FAULT_OOM; diff --git a/drivers/gpu/drm/armada/armada_gem.c b/drivers/gpu/drm/armada/armada_gem.c index 768087ddb046..a293c8be232c 100644 --- a/drivers/gpu/drm/armada/armada_gem.c +++ b/drivers/gpu/drm/armada/armada_gem.c @@ -17,12 +17,11 @@ static int armada_gem_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct armada_gem_object *obj = drm_to_armada_gem(vma->vm_private_data); - unsigned long addr = (unsigned long)vmf->virtual_address; unsigned long pfn = obj->phys_addr >> PAGE_SHIFT; int ret; - pfn += (addr - vma->vm_start) >> PAGE_SHIFT; - ret = vm_insert_pfn(vma, addr, pfn); + pfn += (vmf->address - vma->vm_start) >> PAGE_SHIFT; + ret = vm_insert_pfn(vma, vmf->address, pfn); switch (ret) { case 0: diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c index caa4e4ca616d..bd311c77c254 100644 --- a/drivers/gpu/drm/drm_vm.c +++ b/drivers/gpu/drm/drm_vm.c @@ -124,8 +124,7 @@ static int drm_do_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * Using vm_pgoff as a selector forces us to use this unusual * addressing scheme. */ - resource_size_t offset = (unsigned long)vmf->virtual_address - - vma->vm_start; + resource_size_t offset = vmf->address - vma->vm_start; resource_size_t baddr = map->offset + offset; struct drm_agp_mem *agpmem; struct page *page; @@ -195,7 +194,7 @@ static int drm_do_vm_shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (!map) return VM_FAULT_SIGBUS; /* Nothing allocated */ - offset = (unsigned long)vmf->virtual_address - vma->vm_start; + offset = vmf->address - vma->vm_start; i = (unsigned long)map->handle + offset; page = vmalloc_to_page((void *)i); if (!page) @@ -301,7 +300,8 @@ static int drm_do_vm_dma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (!dma->pagelist) return VM_FAULT_SIGBUS; /* Nothing allocated */ - offset = (unsigned long)vmf->virtual_address - vma->vm_start; /* vm_[pg]off[set] should be 0 */ + offset = vmf->address - vma->vm_start; + /* vm_[pg]off[set] should be 0 */ page_nr = offset >> PAGE_SHIFT; /* page_nr could just be vmf->pgoff */ page = virt_to_page((void *)dma->pagelist[page_nr]); @@ -337,7 +337,7 @@ static int drm_do_vm_sg_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (!entry->pagelist) return VM_FAULT_SIGBUS; /* Nothing allocated */ - offset = (unsigned long)vmf->virtual_address - vma->vm_start; + offset = vmf->address - vma->vm_start; map_offset = map->offset - (unsigned long)dev->sg->virtual; page_offset = (offset >> PAGE_SHIFT) + (map_offset >> PAGE_SHIFT); page = entry->pagelist[page_offset]; diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index 7d066a91d778..114dddbd297b 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -202,15 +202,14 @@ int etnaviv_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } /* We don't use vmf->pgoff since that has the fake offset: */ - pgoff = ((unsigned long)vmf->virtual_address - - vma->vm_start) >> PAGE_SHIFT; + pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT; page = pages[pgoff]; - VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, + VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address, page_to_pfn(page), page_to_pfn(page) << PAGE_SHIFT); - ret = vm_insert_page(vma, (unsigned long)vmf->virtual_address, page); + ret = vm_insert_page(vma, vmf->address, page); out: switch (ret) { @@ -759,7 +758,7 @@ static struct page **etnaviv_gem_userptr_do_get_pages( down_read(&mm->mmap_sem); while (pinned < npages) { ret = get_user_pages_remote(task, mm, ptr, npages - pinned, - flags, pvec + pinned, NULL); + flags, pvec + pinned, NULL, NULL); if (ret < 0) break; diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c index f2ae72ba7d5a..5cfa5121e805 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gem.c +++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c @@ -455,8 +455,7 @@ int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) pgoff_t page_offset; int ret; - page_offset = ((unsigned long)vmf->virtual_address - - vma->vm_start) >> PAGE_SHIFT; + page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT; if (page_offset >= (exynos_gem->size >> PAGE_SHIFT)) { DRM_ERROR("invalid page offset\n"); @@ -465,8 +464,7 @@ int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } pfn = page_to_pfn(exynos_gem->pages[page_offset]); - ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, - __pfn_to_pfn_t(pfn, PFN_DEV)); + ret = vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV)); out: switch (ret) { diff --git a/drivers/gpu/drm/gma500/framebuffer.c b/drivers/gpu/drm/gma500/framebuffer.c index 4071b2d1e8cf..8b44fa542562 100644 --- a/drivers/gpu/drm/gma500/framebuffer.c +++ b/drivers/gpu/drm/gma500/framebuffer.c @@ -125,7 +125,7 @@ static int psbfb_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) psbfb->gtt->offset; page_num = vma_pages(vma); - address = (unsigned long)vmf->virtual_address - (vmf->pgoff << PAGE_SHIFT); + address = vmf->address - (vmf->pgoff << PAGE_SHIFT); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); diff --git a/drivers/gpu/drm/gma500/gem.c b/drivers/gpu/drm/gma500/gem.c index 6d1cb6b370b1..527c62917660 100644 --- a/drivers/gpu/drm/gma500/gem.c +++ b/drivers/gpu/drm/gma500/gem.c @@ -197,15 +197,14 @@ int psb_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) /* Page relative to the VMA start - we must calculate this ourselves because vmf->pgoff is the fake GEM offset */ - page_offset = ((unsigned long) vmf->virtual_address - vma->vm_start) - >> PAGE_SHIFT; + page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT; /* CPU view of the page, don't go via the GART for CPU writes */ if (r->stolen) pfn = (dev_priv->stolen_base + r->offset) >> PAGE_SHIFT; else pfn = page_to_pfn(r->pages[page_offset]); - ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn); + ret = vm_insert_pfn(vma, vmf->address, pfn); fail: mutex_unlock(&dev_priv->mmap_mutex); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 902fa427c196..8f2d0eb66b81 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1796,8 +1796,7 @@ int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf) int ret; /* We don't use vmf->pgoff since that has the fake offset */ - page_offset = ((unsigned long)vmf->virtual_address - area->vm_start) >> - PAGE_SHIFT; + page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT; trace_i915_gem_object_fault(obj, page_offset, true, write); diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 107ddf51065e..d068af2ec3a3 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -515,7 +515,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) obj->userptr.ptr + pinned * PAGE_SIZE, npages - pinned, flags, - pvec + pinned, NULL); + pvec + pinned, NULL, NULL); if (ret < 0) break; diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 57db7dbbb618..a40be13b519e 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -225,16 +225,14 @@ int msm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } /* We don't use vmf->pgoff since that has the fake offset: */ - pgoff = ((unsigned long)vmf->virtual_address - - vma->vm_start) >> PAGE_SHIFT; + pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT; pfn = page_to_pfn(pages[pgoff]); - VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, + VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address, pfn, pfn << PAGE_SHIFT); - ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, - __pfn_to_pfn_t(pfn, PFN_DEV)); + ret = vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV)); out_unlock: mutex_unlock(&dev->struct_mutex); diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c index d4e1e11466f8..4a90c690f09e 100644 --- a/drivers/gpu/drm/omapdrm/omap_gem.c +++ b/drivers/gpu/drm/omapdrm/omap_gem.c @@ -398,8 +398,7 @@ static int fault_1d(struct drm_gem_object *obj, pgoff_t pgoff; /* We don't use vmf->pgoff since that has the fake offset: */ - pgoff = ((unsigned long)vmf->virtual_address - - vma->vm_start) >> PAGE_SHIFT; + pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT; if (omap_obj->pages) { omap_gem_cpu_sync(obj, pgoff); @@ -409,11 +408,10 @@ static int fault_1d(struct drm_gem_object *obj, pfn = (omap_obj->paddr >> PAGE_SHIFT) + pgoff; } - VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, + VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address, pfn, pfn << PAGE_SHIFT); - return vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, - __pfn_to_pfn_t(pfn, PFN_DEV)); + return vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV)); } /* Special handling for the case of faulting in 2d tiled buffers */ @@ -427,7 +425,7 @@ static int fault_2d(struct drm_gem_object *obj, struct page *pages[64]; /* XXX is this too much to have on stack? */ unsigned long pfn; pgoff_t pgoff, base_pgoff; - void __user *vaddr; + unsigned long vaddr; int i, ret, slots; /* @@ -447,8 +445,7 @@ static int fault_2d(struct drm_gem_object *obj, const int m = 1 + ((omap_obj->width << fmt) / PAGE_SIZE); /* We don't use vmf->pgoff since that has the fake offset: */ - pgoff = ((unsigned long)vmf->virtual_address - - vma->vm_start) >> PAGE_SHIFT; + pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT; /* * Actual address we start mapping at is rounded down to previous slot @@ -459,7 +456,7 @@ static int fault_2d(struct drm_gem_object *obj, /* figure out buffer width in slots */ slots = omap_obj->width >> priv->usergart[fmt].slot_shift; - vaddr = vmf->virtual_address - ((pgoff - base_pgoff) << PAGE_SHIFT); + vaddr = vmf->address - ((pgoff - base_pgoff) << PAGE_SHIFT); entry = &priv->usergart[fmt].entry[priv->usergart[fmt].last]; @@ -503,12 +500,11 @@ static int fault_2d(struct drm_gem_object *obj, pfn = entry->paddr >> PAGE_SHIFT; - VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, + VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address, pfn, pfn << PAGE_SHIFT); for (i = n; i > 0; i--) { - vm_insert_mixed(vma, (unsigned long)vaddr, - __pfn_to_pfn_t(pfn, PFN_DEV)); + vm_insert_mixed(vma, vaddr, __pfn_to_pfn_t(pfn, PFN_DEV)); pfn += priv->usergart[fmt].stride_pfn; vaddr += PAGE_SIZE * m; } diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c index c08e5279eeac..7d853e6b5ff0 100644 --- a/drivers/gpu/drm/tegra/gem.c +++ b/drivers/gpu/drm/tegra/gem.c @@ -452,10 +452,10 @@ static int tegra_bo_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (!bo->pages) return VM_FAULT_SIGBUS; - offset = ((unsigned long)vmf->virtual_address - vma->vm_start) >> PAGE_SHIFT; + offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT; page = bo->pages[offset]; - err = vm_insert_page(vma, (unsigned long)vmf->virtual_address, page); + err = vm_insert_page(vma, vmf->address, page); switch (err) { case -EAGAIN: case 0: diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c index 4748aedc933a..68ef993ab431 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c @@ -101,7 +101,7 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct page *page; int ret; int i; - unsigned long address = (unsigned long)vmf->virtual_address; + unsigned long address = vmf->address; int retval = VM_FAULT_NOPAGE; struct ttm_mem_type_manager *man = &bdev->man[bo->mem.mem_type]; diff --git a/drivers/gpu/drm/udl/udl_gem.c b/drivers/gpu/drm/udl/udl_gem.c index 818e70712b18..3c0c4bd3f750 100644 --- a/drivers/gpu/drm/udl/udl_gem.c +++ b/drivers/gpu/drm/udl/udl_gem.c @@ -107,14 +107,13 @@ int udl_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) unsigned int page_offset; int ret = 0; - page_offset = ((unsigned long)vmf->virtual_address - vma->vm_start) >> - PAGE_SHIFT; + page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT; if (!obj->pages) return VM_FAULT_SIGBUS; page = obj->pages[page_offset]; - ret = vm_insert_page(vma, (unsigned long)vmf->virtual_address, page); + ret = vm_insert_page(vma, vmf->address, page); switch (ret) { case -EAGAIN: case 0: diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c index f36c14729b55..477e07f0ecb6 100644 --- a/drivers/gpu/drm/vgem/vgem_drv.c +++ b/drivers/gpu/drm/vgem/vgem_drv.c @@ -54,7 +54,7 @@ static int vgem_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct drm_vgem_gem_object *obj = vma->vm_private_data; /* We don't use vmf->pgoff since that has the fake offset */ - unsigned long vaddr = (unsigned long)vmf->virtual_address; + unsigned long vaddr = vmf->address; struct page *page; page = shmem_read_mapping_page(file_inode(obj->base.filp)->i_mapping, diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 1f0fe3217f23..6b079a31dced 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -578,7 +578,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, */ npages = get_user_pages_remote(owning_process, owning_mm, user_virt, gup_num_pages, - flags, local_page_list, NULL); + flags, local_page_list, NULL, NULL); up_read(&owning_mm->mmap_sem); if (npages < 0) diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index 1db0af6c7f94..ba63ca57ed7e 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -439,13 +439,12 @@ static int videobuf_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct page *page; dprintk(3, "fault: fault @ %08lx [vma %08lx-%08lx]\n", - (unsigned long)vmf->virtual_address, - vma->vm_start, vma->vm_end); + vmf->address, vma->vm_start, vma->vm_end); page = alloc_page(GFP_USER | __GFP_DMA32); if (!page) return VM_FAULT_OOM; - clear_user_highpage(page, (unsigned long)vmf->virtual_address); + clear_user_highpage(page, vmf->address); vmf->page = page; return 0; diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c index ff5e7e8cb1d1..3907387b6d15 100644 --- a/drivers/misc/cxl/context.c +++ b/drivers/misc/cxl/context.c @@ -124,13 +124,12 @@ void cxl_context_set_mapping(struct cxl_context *ctx, static int cxl_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct cxl_context *ctx = vma->vm_file->private_data; - unsigned long address = (unsigned long)vmf->virtual_address; u64 area, offset; offset = vmf->pgoff << PAGE_SHIFT; pr_devel("%s: pe: %i address: 0x%lx offset: 0x%llx\n", - __func__, ctx->pe, address, offset); + __func__, ctx->pe, vmf->address, offset); if (ctx->afu->current_mode == CXL_MODE_DEDICATED) { area = ctx->afu->psn_phys; @@ -162,7 +161,7 @@ static int cxl_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } - vm_insert_pfn(vma, address, (area + offset) >> PAGE_SHIFT); + vm_insert_pfn(vma, vmf->address, (area + offset) >> PAGE_SHIFT); mutex_unlock(&ctx->status_mutex); diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c index 33741ad4a74a..af2e077da4b8 100644 --- a/drivers/misc/sgi-gru/grumain.c +++ b/drivers/misc/sgi-gru/grumain.c @@ -932,7 +932,7 @@ int gru_fault(struct vm_area_struct *vma, struct vm_fault *vmf) unsigned long paddr, vaddr; unsigned long expires; - vaddr = (unsigned long)vmf->virtual_address; + vaddr = vmf->address; gru_dbg(grudev, "vma %p, vaddr 0x%lx (0x%lx)\n", vma, vaddr, GSEG_BASE(vaddr)); STAT(nopfn); diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/calib.c b/drivers/net/wireless/intel/iwlwifi/dvm/calib.c index e9cef9de9ed8..c96f9b1d948a 100644 --- a/drivers/net/wireless/intel/iwlwifi/dvm/calib.c +++ b/drivers/net/wireless/intel/iwlwifi/dvm/calib.c @@ -900,8 +900,7 @@ static void iwlagn_gain_computation(struct iwl_priv *priv, /* bound gain by 2 bits value max, 3rd bit is sign */ data->delta_gain_code[i] = - min(abs(delta_g), - (s32) CHAIN_NOISE_MAX_DELTA_GAIN_CODE); + min(abs(delta_g), CHAIN_NOISE_MAX_DELTA_GAIN_CODE); if (delta_g < 0) /* diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index d5cc3070e83f..b653451843c8 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -882,7 +882,7 @@ static int ion_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) BUG_ON(!buffer->pages || !buffer->pages[vmf->pgoff]); pfn = page_to_pfn(ion_buffer_page(buffer->pages[vmf->pgoff])); - ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn); + ret = vm_insert_pfn(vma, vmf->address, pfn); mutex_unlock(&buffer->lock); if (ret) return VM_FAULT_ERROR; diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c index 33b38bda8319..d102ba0b629c 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_io.c +++ b/drivers/staging/lustre/lustre/llite/vvp_io.c @@ -1018,7 +1018,7 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio) "page %p map %p index %lu flags %lx count %u priv %0lx: got addr %p type NOPAGE\n", vmf->page, vmf->page->mapping, vmf->page->index, (long)vmf->page->flags, page_count(vmf->page), - page_private(vmf->page), vmf->virtual_address); + page_private(vmf->page), (void *)vmf->address); if (unlikely(!(cfio->ft_flags & VM_FAULT_LOCKED))) { lock_page(vmf->page); cfio->ft_flags |= VM_FAULT_LOCKED; @@ -1029,12 +1029,12 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio) } if (cfio->ft_flags & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) { - CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", vmf->virtual_address); + CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", (void *)vmf->address); return -EFAULT; } if (cfio->ft_flags & VM_FAULT_OOM) { - CDEBUG(D_PAGE, "got addr %p - OOM\n", vmf->virtual_address); + CDEBUG(D_PAGE, "got addr %p - OOM\n", (void *)vmf->address); return -ENOMEM; } diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 51810a95416e..cae2e9121e48 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -362,7 +362,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, down_read(&mm->mmap_sem); ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page, - NULL); + NULL, NULL); up_read(&mm->mmap_sem); } diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 702040fe2001..6e3306f4a525 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -602,7 +602,7 @@ static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", vma, vma->vm_start, vma->vm_end, - vmf->pgoff, vmf->virtual_address); + vmf->pgoff, (void *)vmf->address); return VM_FAULT_SIGBUS; } @@ -31,6 +31,7 @@ #include <linux/vmstat.h> #include <linux/pfn_t.h> #include <linux/sizes.h> +#include <linux/mmu_notifier.h> #include <linux/iomap.h> #include "internal.h" @@ -240,6 +241,23 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, } } +static void dax_unlock_mapping_entry(struct address_space *mapping, + pgoff_t index) +{ + void *entry, **slot; + + spin_lock_irq(&mapping->tree_lock); + entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); + if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || + !slot_locked(mapping, slot))) { + spin_unlock_irq(&mapping->tree_lock); + return; + } + unlock_slot(mapping, slot); + spin_unlock_irq(&mapping->tree_lock); + dax_wake_mapping_entry_waiter(mapping, index, entry, false); +} + static void put_locked_mapping_entry(struct address_space *mapping, pgoff_t index, void *entry) { @@ -434,22 +452,6 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping, __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); } -void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index) -{ - void *entry, **slot; - - spin_lock_irq(&mapping->tree_lock); - entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); - if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || - !slot_locked(mapping, slot))) { - spin_unlock_irq(&mapping->tree_lock); - return; - } - unlock_slot(mapping, slot); - spin_unlock_irq(&mapping->tree_lock); - dax_wake_mapping_entry_waiter(mapping, index, entry, false); -} - /* * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree * entry to get unlocked before deleting it. @@ -501,10 +503,8 @@ static int dax_load_hole(struct address_space *mapping, void *entry, /* This will replace locked radix tree entry with a hole page */ page = find_or_create_page(mapping, vmf->pgoff, vmf->gfp_mask | __GFP_ZERO); - if (!page) { - put_locked_mapping_entry(mapping, vmf->pgoff, entry); + if (!page) return VM_FAULT_OOM; - } vmf->page = page; return VM_FAULT_LOCKED; } @@ -616,36 +616,107 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, return new_entry; } +static inline unsigned long +pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) +{ + unsigned long address; + + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); + return address; +} + +/* Walk all mappings of a given index of a file and writeprotect them */ +static void dax_mapping_entry_mkclean(struct address_space *mapping, + pgoff_t index, unsigned long pfn) +{ + struct vm_area_struct *vma; + pte_t *ptep; + pte_t pte; + spinlock_t *ptl; + bool changed; + + i_mmap_lock_read(mapping); + vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { + unsigned long address; + + cond_resched(); + + if (!(vma->vm_flags & VM_SHARED)) + continue; + + address = pgoff_address(index, vma); + changed = false; + if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) + continue; + if (pfn != pte_pfn(*ptep)) + goto unlock; + if (!pte_dirty(*ptep) && !pte_write(*ptep)) + goto unlock; + + flush_cache_page(vma, address, pfn); + pte = ptep_clear_flush(vma, address, ptep); + pte = pte_wrprotect(pte); + pte = pte_mkclean(pte); + set_pte_at(vma->vm_mm, address, ptep, pte); + changed = true; +unlock: + pte_unmap_unlock(ptep, ptl); + + if (changed) + mmu_notifier_invalidate_page(vma->vm_mm, address); + } + i_mmap_unlock_read(mapping); +} + static int dax_writeback_one(struct block_device *bdev, struct address_space *mapping, pgoff_t index, void *entry) { struct radix_tree_root *page_tree = &mapping->page_tree; - struct radix_tree_node *node; struct blk_dax_ctl dax; - void **slot; + void *entry2, **slot; int ret = 0; - spin_lock_irq(&mapping->tree_lock); /* - * Regular page slots are stabilized by the page lock even - * without the tree itself locked. These unlocked entries - * need verification under the tree lock. + * A page got tagged dirty in DAX mapping? Something is seriously + * wrong. */ - if (!__radix_tree_lookup(page_tree, index, &node, &slot)) - goto unlock; - if (*slot != entry) - goto unlock; - - /* another fsync thread may have already written back this entry */ - if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) - goto unlock; + if (WARN_ON(!radix_tree_exceptional_entry(entry))) + return -EIO; + spin_lock_irq(&mapping->tree_lock); + entry2 = get_unlocked_mapping_entry(mapping, index, &slot); + /* Entry got punched out / reallocated? */ + if (!entry2 || !radix_tree_exceptional_entry(entry2)) + goto put_unlocked; + /* + * Entry got reallocated elsewhere? No need to writeback. We have to + * compare sectors as we must not bail out due to difference in lockbit + * or entry type. + */ + if (dax_radix_sector(entry2) != dax_radix_sector(entry)) + goto put_unlocked; if (WARN_ON_ONCE(dax_is_empty_entry(entry) || dax_is_zero_entry(entry))) { ret = -EIO; - goto unlock; + goto put_unlocked; } + /* Another fsync thread may have already written back this entry */ + if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) + goto put_unlocked; + /* Lock the entry to serialize with page faults */ + entry = lock_slot(mapping, slot); + /* + * We can clear the tag now but we have to be careful so that concurrent + * dax_writeback_one() calls for the same index cannot finish before we + * actually flush the caches. This is achieved as the calls will look + * at the entry only under tree_lock and once they do that they will + * see the entry locked and wait for it to unlock. + */ + radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); + spin_unlock_irq(&mapping->tree_lock); + /* * Even if dax_writeback_mapping_range() was given a wbc->range_start * in the middle of a PMD, the 'index' we are given will be aligned to @@ -655,31 +726,40 @@ static int dax_writeback_one(struct block_device *bdev, */ dax.sector = dax_radix_sector(entry); dax.size = PAGE_SIZE << dax_radix_order(entry); - spin_unlock_irq(&mapping->tree_lock); /* * We cannot hold tree_lock while calling dax_map_atomic() because it * eventually calls cond_resched(). */ ret = dax_map_atomic(bdev, &dax); - if (ret < 0) + if (ret < 0) { + put_locked_mapping_entry(mapping, index, entry); return ret; + } if (WARN_ON_ONCE(ret < dax.size)) { ret = -EIO; goto unmap; } + dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn)); wb_cache_pmem(dax.addr, dax.size); - + /* + * After we have flushed the cache, we can clear the dirty tag. There + * cannot be new dirty data in the pfn after the flush has completed as + * the pfn mappings are writeprotected and fault waits for mapping + * entry lock. + */ spin_lock_irq(&mapping->tree_lock); - radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); + radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); spin_unlock_irq(&mapping->tree_lock); unmap: dax_unmap_atomic(bdev, &dax); + put_locked_mapping_entry(mapping, index, entry); return ret; - unlock: + put_unlocked: + put_unlocked_mapping_entry(mapping, index, entry2); spin_unlock_irq(&mapping->tree_lock); return ret; } @@ -739,7 +819,7 @@ static int dax_insert_mapping(struct address_space *mapping, struct block_device *bdev, sector_t sector, size_t size, void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) { - unsigned long vaddr = (unsigned long)vmf->virtual_address; + unsigned long vaddr = vmf->address; struct blk_dax_ctl dax = { .sector = sector, .size = size, @@ -768,17 +848,27 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; - void *entry; + void *entry, **slot; pgoff_t index = vmf->pgoff; spin_lock_irq(&mapping->tree_lock); - entry = get_unlocked_mapping_entry(mapping, index, NULL); - if (!entry || !radix_tree_exceptional_entry(entry)) - goto out; + entry = get_unlocked_mapping_entry(mapping, index, &slot); + if (!entry || !radix_tree_exceptional_entry(entry)) { + if (entry) + put_unlocked_mapping_entry(mapping, index, entry); + spin_unlock_irq(&mapping->tree_lock); + return VM_FAULT_NOPAGE; + } radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); - put_unlocked_mapping_entry(mapping, index, entry); -out: + entry = lock_slot(mapping, slot); spin_unlock_irq(&mapping->tree_lock); + /* + * If we race with somebody updating the PTE and finish_mkwrite_fault() + * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry + * the fault in either case. + */ + finish_mkwrite_fault(vmf); + put_locked_mapping_entry(mapping, index, entry); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); @@ -949,13 +1039,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, { struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; - unsigned long vaddr = (unsigned long)vmf->virtual_address; + unsigned long vaddr = vmf->address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; sector_t sector; struct iomap iomap = { 0 }; unsigned flags = IOMAP_FAULT; int error, major = 0; - int locked_status = 0; + int vmf_ret = 0; void *entry; /* @@ -1008,13 +1098,11 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (error) goto finish_iomap; - if (!radix_tree_exceptional_entry(entry)) { - vmf->page = entry; - locked_status = VM_FAULT_LOCKED; - } else { - vmf->entry = entry; - locked_status = VM_FAULT_DAX_LOCKED; - } + + __SetPageUptodate(vmf->cow_page); + vmf_ret = finish_fault(vmf); + if (!vmf_ret) + vmf_ret = VM_FAULT_DONE_COW; goto finish_iomap; } @@ -1031,7 +1119,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (!(vmf->flags & FAULT_FLAG_WRITE)) { - locked_status = dax_load_hole(mapping, entry, vmf); + vmf_ret = dax_load_hole(mapping, entry, vmf); break; } /*FALLTHRU*/ @@ -1043,7 +1131,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, finish_iomap: if (ops->iomap_end) { - if (error) { + if (error || (vmf_ret & VM_FAULT_ERROR)) { /* keep previous error */ ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags, &iomap); @@ -1053,7 +1141,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, } } unlock_entry: - if (!locked_status || error) + if (vmf_ret != VM_FAULT_LOCKED || error) put_locked_mapping_entry(mapping, vmf->pgoff, entry); out: if (error == -ENOMEM) @@ -1061,9 +1149,9 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error < 0 && error != -EBUSY) return VM_FAULT_SIGBUS | major; - if (locked_status) { + if (vmf_ret) { WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */ - return locked_status; + return vmf_ret; } return VM_FAULT_NOPAGE | major; } diff --git a/fs/exec.c b/fs/exec.c index 88b5e1efdbd6..8112eacf10f3 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -209,7 +209,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, * doing the exec and bprm->mm is the new process's mm. */ ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags, - &page, NULL); + &page, NULL, NULL); if (ret <= 0) return NULL; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 85959d8324df..d96e2f30084b 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -257,9 +257,9 @@ out: * fatal_signal_pending()s, and the mmap_sem must be released before * returning it. */ -int handle_userfault(struct fault_env *fe, unsigned long reason) +int handle_userfault(struct vm_fault *vmf, unsigned long reason) { - struct mm_struct *mm = fe->vma->vm_mm; + struct mm_struct *mm = vmf->vma->vm_mm; struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue uwq; int ret; @@ -268,7 +268,7 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); ret = VM_FAULT_SIGBUS; - ctx = fe->vma->vm_userfaultfd_ctx.ctx; + ctx = vmf->vma->vm_userfaultfd_ctx.ctx; if (!ctx) goto out; @@ -301,17 +301,18 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) * without first stopping userland access to the memory. For * VM_UFFD_MISSING userfaults this is enough for now. */ - if (unlikely(!(fe->flags & FAULT_FLAG_ALLOW_RETRY))) { + if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) { /* * Validate the invariant that nowait must allow retry * to be sure not to return SIGBUS erroneously on * nowait invocations. */ - BUG_ON(fe->flags & FAULT_FLAG_RETRY_NOWAIT); + BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT); #ifdef CONFIG_DEBUG_VM if (printk_ratelimit()) { printk(KERN_WARNING - "FAULT_FLAG_ALLOW_RETRY missing %x\n", fe->flags); + "FAULT_FLAG_ALLOW_RETRY missing %x\n", + vmf->flags); dump_stack(); } #endif @@ -323,7 +324,7 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) * and wait. */ ret = VM_FAULT_RETRY; - if (fe->flags & FAULT_FLAG_RETRY_NOWAIT) + if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) goto out; /* take the reference before dropping the mmap_sem */ @@ -331,11 +332,11 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; - uwq.msg = userfault_msg(fe->address, fe->flags, reason); + uwq.msg = userfault_msg(vmf->address, vmf->flags, reason); uwq.ctx = ctx; return_to_userland = - (fe->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == + (vmf->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); spin_lock(&ctx->fault_pending_wqh.lock); @@ -353,7 +354,8 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) TASK_KILLABLE); spin_unlock(&ctx->fault_pending_wqh.lock); - must_wait = userfaultfd_must_wait(ctx, fe->address, fe->flags, reason); + must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, + reason); up_read(&mm->mmap_sem); if (likely(must_wait && !ACCESS_ONCE(ctx->released) && diff --git a/include/linux/dax.h b/include/linux/dax.h index 0afade8bd3d7..f97bcfe79472 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -46,7 +46,6 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping, #ifdef CONFIG_FS_DAX struct page *read_dax_sector(struct block_device *bdev, sector_t n); -void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index); int __dax_zero_page_range(struct block_device *bdev, sector_t sector, unsigned int offset, unsigned int length); #else @@ -55,12 +54,6 @@ static inline struct page *read_dax_sector(struct block_device *bdev, { return ERR_PTR(-ENXIO); } -/* Shouldn't ever be called when dax is disabled. */ -static inline void dax_unlock_mapping_entry(struct address_space *mapping, - pgoff_t index) -{ - BUG(); -} static inline int __dax_zero_page_range(struct block_device *bdev, sector_t sector, unsigned int offset, unsigned int length) { diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 1f782aa1d8e6..97e478d6b690 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -1,12 +1,12 @@ #ifndef _LINUX_HUGE_MM_H #define _LINUX_HUGE_MM_H -extern int do_huge_pmd_anonymous_page(struct fault_env *fe); +extern int do_huge_pmd_anonymous_page(struct vm_fault *vmf); extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma); -extern void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd); -extern int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd); +extern void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd); +extern int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd); extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, @@ -142,7 +142,7 @@ static inline int hpage_nr_pages(struct page *page) return 1; } -extern int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd); +extern int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd); extern struct page *huge_zero_page; @@ -212,7 +212,7 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, return NULL; } -static inline int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd) +static inline int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd) { return 0; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 0b5b2e4df14e..4424784ac374 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -292,36 +292,23 @@ extern pgprot_t protection_map[16]; * pgoff should be used in favour of virtual_address, if possible. */ struct vm_fault { + struct vm_area_struct *vma; /* Target VMA */ unsigned int flags; /* FAULT_FLAG_xxx flags */ gfp_t gfp_mask; /* gfp mask to be used for allocations */ pgoff_t pgoff; /* Logical page offset based on vma */ - void __user *virtual_address; /* Faulting virtual address */ + unsigned long address; /* Faulting virtual address */ + pmd_t *pmd; /* Pointer to pmd entry matching + * the 'address' */ + pte_t orig_pte; /* Value of PTE at the time of fault */ - struct page *cow_page; /* Handler may choose to COW */ + struct page *cow_page; /* Page handler may use for COW fault */ + struct mem_cgroup *memcg; /* Cgroup cow_page belongs to */ struct page *page; /* ->fault handlers should return a * page here, unless VM_FAULT_NOPAGE * is set (which is also implied by * VM_FAULT_ERROR). */ - void *entry; /* ->fault handler can alternatively - * return locked DAX entry. In that - * case handler should return - * VM_FAULT_DAX_LOCKED and fill in - * entry here. - */ -}; - -/* - * Page fault context: passes though page fault handler instead of endless list - * of function arguments. - */ -struct fault_env { - struct vm_area_struct *vma; /* Target VMA */ - unsigned long address; /* Faulting virtual address */ - unsigned int flags; /* FAULT_FLAG_xxx flags */ - pmd_t *pmd; /* Pointer to pmd entry matching - * the 'address' - */ + /* These three entries are valid only while holding ptl lock */ pte_t *pte; /* Pointer to pte entry matching * the 'address'. NULL if the page * table hasn't been allocated. @@ -351,7 +338,7 @@ struct vm_operations_struct { int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); int (*pmd_fault)(struct vm_area_struct *, unsigned long address, pmd_t *, unsigned int flags); - void (*map_pages)(struct fault_env *fe, + void (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); /* notification that a previously read-only page is about to become @@ -625,8 +612,10 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } -int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, +int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, struct page *page); +int finish_fault(struct vm_fault *vmf); +int finish_mkwrite_fault(struct vm_fault *vmf); #endif /* @@ -1110,7 +1099,7 @@ static inline void clear_page_pfmemalloc(struct page *page) #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ #define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ -#define VM_FAULT_DAX_LOCKED 0x1000 /* ->fault has locked DAX entry */ +#define VM_FAULT_DONE_COW 0x1000 /* ->fault has fully handled COW */ #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ @@ -1221,6 +1210,8 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); +int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, + spinlock_t **ptlp); int follow_pfn(struct vm_area_struct *vma, unsigned long address, unsigned long *pfn); int follow_phys(struct vm_area_struct *vma, unsigned long address, @@ -1276,15 +1267,12 @@ extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas); + struct vm_area_struct **vmas, int *locked); long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas); long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); -long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - struct page **pages, unsigned int gup_flags); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); int get_user_pages_fast(unsigned long start, int nr_pages, int write, @@ -2099,7 +2087,7 @@ extern void truncate_inode_pages_final(struct address_space *); /* generic vm_area_ops exported for stackable file systems */ extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); -extern void filemap_map_pages(struct fault_env *fe, +extern void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); diff --git a/include/linux/nmi.h b/include/linux/nmi.h index a78c35cff1ae..aacca824a6ae 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -7,6 +7,23 @@ #include <linux/sched.h> #include <asm/irq.h> +/* + * The run state of the lockup detectors is controlled by the content of the + * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit - + * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector. + * + * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled' + * are variables that are only used as an 'interface' between the parameters + * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The + * 'watchdog_thresh' variable is handled differently because its value is not + * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh' + * is equal zero. + */ +#define NMI_WATCHDOG_ENABLED_BIT 0 +#define SOFT_WATCHDOG_ENABLED_BIT 1 +#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) +#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) + /** * touch_nmi_watchdog - restart NMI watchdog timeout. * @@ -91,9 +108,16 @@ extern int nmi_watchdog_enabled; extern int soft_watchdog_enabled; extern int watchdog_user_enabled; extern int watchdog_thresh; +extern unsigned long watchdog_enabled; extern unsigned long *watchdog_cpumask_bits; +#ifdef CONFIG_SMP extern int sysctl_softlockup_all_cpu_backtrace; extern int sysctl_hardlockup_all_cpu_backtrace; +#else +#define sysctl_softlockup_all_cpu_backtrace 0 +#define sysctl_hardlockup_all_cpu_backtrace 0 +#endif +extern bool is_hardlockup(void); struct ctl_table; extern int proc_watchdog(struct ctl_table *, int , void __user *, size_t *, loff_t *); diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index dd66a952e8cd..11b92b047a1e 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -27,7 +27,7 @@ #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) -extern int handle_userfault(struct fault_env *fe, unsigned long reason); +extern int handle_userfault(struct vm_fault *vmf, unsigned long reason); extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len); @@ -55,7 +55,7 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) #else /* CONFIG_USERFAULTFD */ /* mm helpers */ -static inline int handle_userfault(struct fault_env *fe, unsigned long reason) +static inline int handle_userfault(struct vm_fault *vmf, unsigned long reason) { return VM_FAULT_SIGBUS; } diff --git a/ipc/sem.c b/ipc/sem.c index e84703d5024e..31eaa87ecba4 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -11,6 +11,7 @@ * (c) 2001 Red Hat Inc * Lockless wakeup * (c) 2003 Manfred Spraul <manfred@colorfullife.com> + * (c) 2016 Davidlohr Bueso <dave@stgolabs.net> * Further wakeup optimizations, documentation * (c) 2010 Manfred Spraul <manfred@colorfullife.com> * @@ -53,15 +54,11 @@ * Semaphores are actively given to waiting tasks (necessary for FIFO). * (see update_queue()) * - To improve the scalability, the actual wake-up calls are performed after - * dropping all locks. (see wake_up_sem_queue_prepare(), - * wake_up_sem_queue_do()) + * dropping all locks. (see wake_up_sem_queue_prepare()) * - All work is done by the waker, the woken up task does not have to do * anything - not even acquiring a lock or dropping a refcount. * - A woken up task may not even touch the semaphore array anymore, it may * have been destroyed already by a semctl(RMID). - * - The synchronizations between wake-ups due to a timeout/signal and a - * wake-up due to a completed semaphore operation is achieved by using an - * intermediate state (IN_WAKEUP). * - UNDO values are stored in an array (one per process and per * semaphore array, lazily allocated). For backwards compatibility, multiple * modes for the UNDO variables are supported (per process, per thread) @@ -118,7 +115,8 @@ struct sem_queue { struct sembuf *sops; /* array of pending operations */ struct sembuf *blocking; /* the operation that blocked */ int nsops; /* number of operations */ - int alter; /* does *sops alter the array? */ + bool alter; /* does *sops alter the array? */ + bool dupsop; /* sops on more than one sem_num */ }; /* Each task has a list of undo requests. They are executed automatically @@ -433,29 +431,6 @@ static inline void sem_unlock(struct sem_array *sma, int locknum) * * The caller holds the RCU read lock. */ -static inline struct sem_array *sem_obtain_lock(struct ipc_namespace *ns, - int id, struct sembuf *sops, int nsops, int *locknum) -{ - struct kern_ipc_perm *ipcp; - struct sem_array *sma; - - ipcp = ipc_obtain_object_idr(&sem_ids(ns), id); - if (IS_ERR(ipcp)) - return ERR_CAST(ipcp); - - sma = container_of(ipcp, struct sem_array, sem_perm); - *locknum = sem_lock(sma, sops, nsops); - - /* ipc_rmid() may have already freed the ID while sem_lock - * was spinning: verify that the structure is still valid - */ - if (ipc_valid_object(ipcp)) - return container_of(ipcp, struct sem_array, sem_perm); - - sem_unlock(sma, *locknum); - return ERR_PTR(-EINVAL); -} - static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id) { struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&sem_ids(ns), id); @@ -488,40 +463,6 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) ipc_rmid(&sem_ids(ns), &s->sem_perm); } -/* - * Lockless wakeup algorithm: - * Without the check/retry algorithm a lockless wakeup is possible: - * - queue.status is initialized to -EINTR before blocking. - * - wakeup is performed by - * * unlinking the queue entry from the pending list - * * setting queue.status to IN_WAKEUP - * This is the notification for the blocked thread that a - * result value is imminent. - * * call wake_up_process - * * set queue.status to the final value. - * - the previously blocked thread checks queue.status: - * * if it's IN_WAKEUP, then it must wait until the value changes - * * if it's not -EINTR, then the operation was completed by - * update_queue. semtimedop can return queue.status without - * performing any operation on the sem array. - * * otherwise it must acquire the spinlock and check what's up. - * - * The two-stage algorithm is necessary to protect against the following - * races: - * - if queue.status is set after wake_up_process, then the woken up idle - * thread could race forward and try (and fail) to acquire sma->lock - * before update_queue had a chance to set queue.status - * - if queue.status is written before wake_up_process and if the - * blocked process is woken up by a signal between writing - * queue.status and the wake_up_process, then the woken up - * process could return from semtimedop and die by calling - * sys_exit before wake_up_process is called. Then wake_up_process - * will oops, because the task structure is already invalid. - * (yes, this happened on s390 with sysv msg). - * - */ -#define IN_WAKEUP 1 - /** * newary - Create a new semaphore set * @ns: namespace @@ -641,15 +582,23 @@ SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg) } /** - * perform_atomic_semop - Perform (if possible) a semaphore operation + * perform_atomic_semop[_slow] - Attempt to perform semaphore + * operations on a given array. * @sma: semaphore array * @q: struct sem_queue that describes the operation * + * Caller blocking are as follows, based the value + * indicated by the semaphore operation (sem_op): + * + * (1) >0 never blocks. + * (2) 0 (wait-for-zero operation): semval is non-zero. + * (3) <0 attempting to decrement semval to a value smaller than zero. + * * Returns 0 if the operation was possible. * Returns 1 if the operation is impossible, the caller must sleep. - * Negative values are error codes. + * Returns <0 for error codes. */ -static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q) +static int perform_atomic_semop_slow(struct sem_array *sma, struct sem_queue *q) { int result, sem_op, nsops, pid; struct sembuf *sop; @@ -720,51 +669,84 @@ undo: return result; } -/** wake_up_sem_queue_prepare(q, error): Prepare wake-up - * @q: queue entry that must be signaled - * @error: Error value for the signal - * - * Prepare the wake-up of the queue entry q. - */ -static void wake_up_sem_queue_prepare(struct list_head *pt, - struct sem_queue *q, int error) +static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q) { - if (list_empty(pt)) { - /* - * Hold preempt off so that we don't get preempted and have the - * wakee busy-wait until we're scheduled back on. - */ - preempt_disable(); + int result, sem_op, nsops; + struct sembuf *sop; + struct sem *curr; + struct sembuf *sops; + struct sem_undo *un; + + sops = q->sops; + nsops = q->nsops; + un = q->undo; + + if (unlikely(q->dupsop)) + return perform_atomic_semop_slow(sma, q); + + /* + * We scan the semaphore set twice, first to ensure that the entire + * operation can succeed, therefore avoiding any pointless writes + * to shared memory and having to undo such changes in order to block + * until the operations can go through. + */ + for (sop = sops; sop < sops + nsops; sop++) { + curr = sma->sem_base + sop->sem_num; + sem_op = sop->sem_op; + result = curr->semval; + + if (!sem_op && result) + goto would_block; /* wait-for-zero */ + + result += sem_op; + if (result < 0) + goto would_block; + + if (result > SEMVMX) + return -ERANGE; + + if (sop->sem_flg & SEM_UNDO) { + int undo = un->semadj[sop->sem_num] - sem_op; + + /* Exceeding the undo range is an error. */ + if (undo < (-SEMAEM - 1) || undo > SEMAEM) + return -ERANGE; + } + } + + for (sop = sops; sop < sops + nsops; sop++) { + curr = sma->sem_base + sop->sem_num; + sem_op = sop->sem_op; + result = curr->semval; + + if (sop->sem_flg & SEM_UNDO) { + int undo = un->semadj[sop->sem_num] - sem_op; + + un->semadj[sop->sem_num] = undo; + } + curr->semval += sem_op; + curr->sempid = q->pid; } - q->status = IN_WAKEUP; - q->pid = error; - list_add_tail(&q->list, pt); + return 0; + +would_block: + q->blocking = sop; + return sop->sem_flg & IPC_NOWAIT ? -EAGAIN : 1; } -/** - * wake_up_sem_queue_do - do the actual wake-up - * @pt: list of tasks to be woken up - * - * Do the actual wake-up. - * The function is called without any locks held, thus the semaphore array - * could be destroyed already and the tasks can disappear as soon as the - * status is set to the actual return code. - */ -static void wake_up_sem_queue_do(struct list_head *pt) +static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error, + struct wake_q_head *wake_q) { - struct sem_queue *q, *t; - int did_something; - - did_something = !list_empty(pt); - list_for_each_entry_safe(q, t, pt, list) { - wake_up_process(q->sleeper); - /* q can disappear immediately after writing q->status. */ - smp_wmb(); - q->status = q->pid; - } - if (did_something) - preempt_enable(); + wake_q_add(wake_q, q->sleeper); + /* + * Rely on the above implicit barrier, such that we can + * ensure that we hold reference to the task before setting + * q->status. Otherwise we could race with do_exit if the + * task is awoken by an external event before calling + * wake_up_process(). + */ + WRITE_ONCE(q->status, error); } static void unlink_queue(struct sem_array *sma, struct sem_queue *q) @@ -784,7 +766,7 @@ static void unlink_queue(struct sem_array *sma, struct sem_queue *q) * modified the array. * Note that wait-for-zero operations are handled without restart. */ -static int check_restart(struct sem_array *sma, struct sem_queue *q) +static inline int check_restart(struct sem_array *sma, struct sem_queue *q) { /* pending complex alter operations are too difficult to analyse */ if (!list_empty(&sma->pending_alter)) @@ -812,21 +794,20 @@ static int check_restart(struct sem_array *sma, struct sem_queue *q) * wake_const_ops - wake up non-alter tasks * @sma: semaphore array. * @semnum: semaphore that was modified. - * @pt: list head for the tasks that must be woken up. + * @wake_q: lockless wake-queue head. * * wake_const_ops must be called after a semaphore in a semaphore array * was set to 0. If complex const operations are pending, wake_const_ops must * be called with semnum = -1, as well as with the number of each modified * semaphore. - * The tasks that must be woken up are added to @pt. The return code + * The tasks that must be woken up are added to @wake_q. The return code * is stored in q->pid. * The function returns 1 if at least one operation was completed successfully. */ static int wake_const_ops(struct sem_array *sma, int semnum, - struct list_head *pt) + struct wake_q_head *wake_q) { - struct sem_queue *q; - struct list_head *walk; + struct sem_queue *q, *tmp; struct list_head *pending_list; int semop_completed = 0; @@ -835,25 +816,19 @@ static int wake_const_ops(struct sem_array *sma, int semnum, else pending_list = &sma->sem_base[semnum].pending_const; - walk = pending_list->next; - while (walk != pending_list) { - int error; - - q = container_of(walk, struct sem_queue, list); - walk = walk->next; - - error = perform_atomic_semop(sma, q); - - if (error <= 0) { - /* operation completed, remove from queue & wakeup */ + list_for_each_entry_safe(q, tmp, pending_list, list) { + int error = perform_atomic_semop(sma, q); - unlink_queue(sma, q); + if (error > 0) + continue; + /* operation completed, remove from queue & wakeup */ + unlink_queue(sma, q); - wake_up_sem_queue_prepare(pt, q, error); - if (error == 0) - semop_completed = 1; - } + wake_up_sem_queue_prepare(q, error, wake_q); + if (error == 0) + semop_completed = 1; } + return semop_completed; } @@ -862,14 +837,14 @@ static int wake_const_ops(struct sem_array *sma, int semnum, * @sma: semaphore array * @sops: operations that were performed * @nsops: number of operations - * @pt: list head of the tasks that must be woken up. + * @wake_q: lockless wake-queue head * * Checks all required queue for wait-for-zero operations, based * on the actual changes that were performed on the semaphore array. * The function returns 1 if at least one operation was completed successfully. */ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, - int nsops, struct list_head *pt) + int nsops, struct wake_q_head *wake_q) { int i; int semop_completed = 0; @@ -882,7 +857,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, if (sma->sem_base[num].semval == 0) { got_zero = 1; - semop_completed |= wake_const_ops(sma, num, pt); + semop_completed |= wake_const_ops(sma, num, wake_q); } } } else { @@ -893,7 +868,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, for (i = 0; i < sma->sem_nsems; i++) { if (sma->sem_base[i].semval == 0) { got_zero = 1; - semop_completed |= wake_const_ops(sma, i, pt); + semop_completed |= wake_const_ops(sma, i, wake_q); } } } @@ -902,7 +877,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, * then check the global queue, too. */ if (got_zero) - semop_completed |= wake_const_ops(sma, -1, pt); + semop_completed |= wake_const_ops(sma, -1, wake_q); return semop_completed; } @@ -912,22 +887,21 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, * update_queue - look for tasks that can be completed. * @sma: semaphore array. * @semnum: semaphore that was modified. - * @pt: list head for the tasks that must be woken up. + * @wake_q: lockless wake-queue head. * * update_queue must be called after a semaphore in a semaphore array * was modified. If multiple semaphores were modified, update_queue must * be called with semnum = -1, as well as with the number of each modified * semaphore. - * The tasks that must be woken up are added to @pt. The return code + * The tasks that must be woken up are added to @wake_q. The return code * is stored in q->pid. * The function internally checks if const operations can now succeed. * * The function return 1 if at least one semop was completed successfully. */ -static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt) +static int update_queue(struct sem_array *sma, int semnum, struct wake_q_head *wake_q) { - struct sem_queue *q; - struct list_head *walk; + struct sem_queue *q, *tmp; struct list_head *pending_list; int semop_completed = 0; @@ -937,13 +911,9 @@ static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt) pending_list = &sma->sem_base[semnum].pending_alter; again: - walk = pending_list->next; - while (walk != pending_list) { + list_for_each_entry_safe(q, tmp, pending_list, list) { int error, restart; - q = container_of(walk, struct sem_queue, list); - walk = walk->next; - /* If we are scanning the single sop, per-semaphore list of * one semaphore and that semaphore is 0, then it is not * necessary to scan further: simple increments @@ -966,11 +936,11 @@ again: restart = 0; } else { semop_completed = 1; - do_smart_wakeup_zero(sma, q->sops, q->nsops, pt); + do_smart_wakeup_zero(sma, q->sops, q->nsops, wake_q); restart = check_restart(sma, q); } - wake_up_sem_queue_prepare(pt, q, error); + wake_up_sem_queue_prepare(q, error, wake_q); if (restart) goto again; } @@ -1001,24 +971,24 @@ static void set_semotime(struct sem_array *sma, struct sembuf *sops) * @sops: operations that were performed * @nsops: number of operations * @otime: force setting otime - * @pt: list head of the tasks that must be woken up. + * @wake_q: lockless wake-queue head * * do_smart_update() does the required calls to update_queue and wakeup_zero, * based on the actual changes that were performed on the semaphore array. * Note that the function does not do the actual wake-up: the caller is - * responsible for calling wake_up_sem_queue_do(@pt). + * responsible for calling wake_up_q(). * It is safe to perform this call after dropping all locks. */ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops, - int otime, struct list_head *pt) + int otime, struct wake_q_head *wake_q) { int i; - otime |= do_smart_wakeup_zero(sma, sops, nsops, pt); + otime |= do_smart_wakeup_zero(sma, sops, nsops, wake_q); if (!list_empty(&sma->pending_alter)) { /* semaphore array uses the global queue - just process it. */ - otime |= update_queue(sma, -1, pt); + otime |= update_queue(sma, -1, wake_q); } else { if (!sops) { /* @@ -1026,7 +996,7 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop * known. Check all. */ for (i = 0; i < sma->sem_nsems; i++) - otime |= update_queue(sma, i, pt); + otime |= update_queue(sma, i, wake_q); } else { /* * Check the semaphores that were increased: @@ -1040,7 +1010,7 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop for (i = 0; i < nsops; i++) { if (sops[i].sem_op > 0) { otime |= update_queue(sma, - sops[i].sem_num, pt); + sops[i].sem_num, wake_q); } } } @@ -1128,8 +1098,8 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) struct sem_undo *un, *tu; struct sem_queue *q, *tq; struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm); - struct list_head tasks; int i; + DEFINE_WAKE_Q(wake_q); /* Free the existing undo structures for this semaphore set. */ ipc_assert_locked_object(&sma->sem_perm); @@ -1143,25 +1113,24 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) } /* Wake up all pending processes and let them fail with EIDRM. */ - INIT_LIST_HEAD(&tasks); list_for_each_entry_safe(q, tq, &sma->pending_const, list) { unlink_queue(sma, q); - wake_up_sem_queue_prepare(&tasks, q, -EIDRM); + wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); } list_for_each_entry_safe(q, tq, &sma->pending_alter, list) { unlink_queue(sma, q); - wake_up_sem_queue_prepare(&tasks, q, -EIDRM); + wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); } for (i = 0; i < sma->sem_nsems; i++) { struct sem *sem = sma->sem_base + i; list_for_each_entry_safe(q, tq, &sem->pending_const, list) { unlink_queue(sma, q); - wake_up_sem_queue_prepare(&tasks, q, -EIDRM); + wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); } list_for_each_entry_safe(q, tq, &sem->pending_alter, list) { unlink_queue(sma, q); - wake_up_sem_queue_prepare(&tasks, q, -EIDRM); + wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); } } @@ -1170,7 +1139,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) sem_unlock(sma, -1); rcu_read_unlock(); - wake_up_sem_queue_do(&tasks); + wake_up_q(&wake_q); ns->used_sems -= sma->sem_nsems; ipc_rcu_putref(sma, sem_rcu_free); } @@ -1309,9 +1278,9 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, struct sem_undo *un; struct sem_array *sma; struct sem *curr; - int err; - struct list_head tasks; - int val; + int err, val; + DEFINE_WAKE_Q(wake_q); + #if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN) /* big-endian 64bit */ val = arg >> 32; @@ -1323,8 +1292,6 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, if (val > SEMVMX || val < 0) return -ERANGE; - INIT_LIST_HEAD(&tasks); - rcu_read_lock(); sma = sem_obtain_object_check(ns, semid); if (IS_ERR(sma)) { @@ -1367,10 +1334,10 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, curr->sempid = task_tgid_vnr(current); sma->sem_ctime = get_seconds(); /* maybe some queued-up processes were waiting for this */ - do_smart_update(sma, NULL, 0, 0, &tasks); + do_smart_update(sma, NULL, 0, 0, &wake_q); sem_unlock(sma, -1); rcu_read_unlock(); - wake_up_sem_queue_do(&tasks); + wake_up_q(&wake_q); return 0; } @@ -1382,9 +1349,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, int err, nsems; ushort fast_sem_io[SEMMSL_FAST]; ushort *sem_io = fast_sem_io; - struct list_head tasks; - - INIT_LIST_HEAD(&tasks); + DEFINE_WAKE_Q(wake_q); rcu_read_lock(); sma = sem_obtain_object_check(ns, semid); @@ -1495,7 +1460,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, } sma->sem_ctime = get_seconds(); /* maybe some queued-up processes were waiting for this */ - do_smart_update(sma, NULL, 0, 0, &tasks); + do_smart_update(sma, NULL, 0, 0, &wake_q); err = 0; goto out_unlock; } @@ -1531,7 +1496,7 @@ out_unlock: sem_unlock(sma, -1); out_rcu_wakeup: rcu_read_unlock(); - wake_up_sem_queue_do(&tasks); + wake_up_q(&wake_q); out_free: if (sem_io != fast_sem_io) ipc_free(sem_io); @@ -1804,32 +1769,6 @@ out: return un; } - -/** - * get_queue_result - retrieve the result code from sem_queue - * @q: Pointer to queue structure - * - * Retrieve the return code from the pending queue. If IN_WAKEUP is found in - * q->status, then we must loop until the value is replaced with the final - * value: This may happen if a task is woken up by an unrelated event (e.g. - * signal) and in parallel the task is woken up by another task because it got - * the requested semaphores. - * - * The function can be called with or without holding the semaphore spinlock. - */ -static int get_queue_result(struct sem_queue *q) -{ - int error; - - error = q->status; - while (unlikely(error == IN_WAKEUP)) { - cpu_relax(); - error = q->status; - } - - return error; -} - SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, unsigned, nsops, const struct timespec __user *, timeout) { @@ -1838,11 +1777,11 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, struct sembuf fast_sops[SEMOPM_FAST]; struct sembuf *sops = fast_sops, *sop; struct sem_undo *un; - int undos = 0, alter = 0, max, locknum; + int max, locknum; + bool undos = false, alter = false, dupsop = false; struct sem_queue queue; - unsigned long jiffies_left = 0; + unsigned long dup = 0, jiffies_left = 0; struct ipc_namespace *ns; - struct list_head tasks; ns = current->nsproxy->ipc_ns; @@ -1855,10 +1794,12 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, if (sops == NULL) return -ENOMEM; } + if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) { error = -EFAULT; goto out_free; } + if (timeout) { struct timespec _timeout; if (copy_from_user(&_timeout, timeout, sizeof(*timeout))) { @@ -1872,18 +1813,30 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, } jiffies_left = timespec_to_jiffies(&_timeout); } + max = 0; for (sop = sops; sop < sops + nsops; sop++) { + unsigned long mask = 1ULL << ((sop->sem_num) % BITS_PER_LONG); + if (sop->sem_num >= max) max = sop->sem_num; if (sop->sem_flg & SEM_UNDO) - undos = 1; - if (sop->sem_op != 0) - alter = 1; + undos = true; + if (dup & mask) { + /* + * There was a previous alter access that appears + * to have accessed the same semaphore, thus use + * the dupsop logic. "appears", because the detection + * can only check % BITS_PER_LONG. + */ + dupsop = true; + } + if (sop->sem_op != 0) { + alter = true; + dup |= mask; + } } - INIT_LIST_HEAD(&tasks); - if (undos) { /* On success, find_alloc_undo takes the rcu_read_lock */ un = find_alloc_undo(ns, semid); @@ -1904,16 +1857,22 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, } error = -EFBIG; - if (max >= sma->sem_nsems) - goto out_rcu_wakeup; + if (max >= sma->sem_nsems) { + rcu_read_unlock(); + goto out_free; + } error = -EACCES; - if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) - goto out_rcu_wakeup; + if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) { + rcu_read_unlock(); + goto out_free; + } error = security_sem_semop(sma, sops, nsops, alter); - if (error) - goto out_rcu_wakeup; + if (error) { + rcu_read_unlock(); + goto out_free; + } error = -EIDRM; locknum = sem_lock(sma, sops, nsops); @@ -1942,24 +1901,34 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, queue.undo = un; queue.pid = task_tgid_vnr(current); queue.alter = alter; + queue.dupsop = dupsop; error = perform_atomic_semop(sma, &queue); - if (error == 0) { - /* If the operation was successful, then do + if (error == 0) { /* non-blocking succesfull path */ + DEFINE_WAKE_Q(wake_q); + + /* + * If the operation was successful, then do * the required updates. */ if (alter) - do_smart_update(sma, sops, nsops, 1, &tasks); + do_smart_update(sma, sops, nsops, 1, &wake_q); else set_semotime(sma, sops); + + sem_unlock(sma, locknum); + rcu_read_unlock(); + wake_up_q(&wake_q); + + goto out_free; } - if (error <= 0) + if (error < 0) /* non-blocking error path */ goto out_unlock_free; - /* We need to sleep on this operation, so we put the current + /* + * We need to sleep on this operation, so we put the current * task into the pending queue and go to sleep. */ - if (nsops == 1) { struct sem *curr; curr = &sma->sem_base[sops->sem_num]; @@ -1988,77 +1957,69 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, sma->complex_count++; } - queue.status = -EINTR; - queue.sleeper = current; + do { + queue.status = -EINTR; + queue.sleeper = current; -sleep_again: - __set_current_state(TASK_INTERRUPTIBLE); - sem_unlock(sma, locknum); - rcu_read_unlock(); - - if (timeout) - jiffies_left = schedule_timeout(jiffies_left); - else - schedule(); + __set_current_state(TASK_INTERRUPTIBLE); + sem_unlock(sma, locknum); + rcu_read_unlock(); - error = get_queue_result(&queue); + if (timeout) + jiffies_left = schedule_timeout(jiffies_left); + else + schedule(); - if (error != -EINTR) { - /* fast path: update_queue already obtained all requested - * resources. - * Perform a smp_mb(): User space could assume that semop() - * is a memory barrier: Without the mb(), the cpu could - * speculatively read in user space stale data that was - * overwritten by the previous owner of the semaphore. + /* + * fastpath: the semop has completed, either successfully or + * not, from the syscall pov, is quite irrelevant to us at this + * point; we're done. + * + * We _do_ care, nonetheless, about being awoken by a signal or + * spuriously. The queue.status is checked again in the + * slowpath (aka after taking sem_lock), such that we can detect + * scenarios where we were awakened externally, during the + * window between wake_q_add() and wake_up_q(). */ - smp_mb(); - - goto out_free; - } - - rcu_read_lock(); - sma = sem_obtain_lock(ns, semid, sops, nsops, &locknum); - - /* - * Wait until it's guaranteed that no wakeup_sem_queue_do() is ongoing. - */ - error = get_queue_result(&queue); + error = READ_ONCE(queue.status); + if (error != -EINTR) { + /* + * User space could assume that semop() is a memory + * barrier: Without the mb(), the cpu could + * speculatively read in userspace stale data that was + * overwritten by the previous owner of the semaphore. + */ + smp_mb(); + goto out_free; + } - /* - * Array removed? If yes, leave without sem_unlock(). - */ - if (IS_ERR(sma)) { - rcu_read_unlock(); - goto out_free; - } + rcu_read_lock(); + sem_lock(sma, sops, nsops); + if (!ipc_valid_object(&sma->sem_perm)) + goto out_unlock_free; - /* - * If queue.status != -EINTR we are woken up by another process. - * Leave without unlink_queue(), but with sem_unlock(). - */ - if (error != -EINTR) - goto out_unlock_free; + error = READ_ONCE(queue.status); - /* - * If an interrupt occurred we have to clean up the queue - */ - if (timeout && jiffies_left == 0) - error = -EAGAIN; + /* + * If queue.status != -EINTR we are woken up by another process. + * Leave without unlink_queue(), but with sem_unlock(). + */ + if (error != -EINTR) + goto out_unlock_free; - /* - * If the wakeup was spurious, just retry - */ - if (error == -EINTR && !signal_pending(current)) - goto sleep_again; + /* + * If an interrupt occurred we have to clean up the queue. + */ + if (timeout && jiffies_left == 0) + error = -EAGAIN; + } while (error == -EINTR && !signal_pending(current)); /* spurious */ unlink_queue(sma, &queue); out_unlock_free: sem_unlock(sma, locknum); -out_rcu_wakeup: rcu_read_unlock(); - wake_up_sem_queue_do(&tasks); out_free: if (sops != fast_sops) kfree(sops); @@ -2119,8 +2080,8 @@ void exit_sem(struct task_struct *tsk) for (;;) { struct sem_array *sma; struct sem_undo *un; - struct list_head tasks; int semid, i; + DEFINE_WAKE_Q(wake_q); cond_resched(); @@ -2208,11 +2169,10 @@ void exit_sem(struct task_struct *tsk) } } /* maybe some queued-up processes were waiting for this */ - INIT_LIST_HEAD(&tasks); - do_smart_update(sma, NULL, 0, 1, &tasks); + do_smart_update(sma, NULL, 0, 1, &wake_q); sem_unlock(sma, -1); rcu_read_unlock(); - wake_up_sem_queue_do(&tasks); + wake_up_q(&wake_q); kfree_rcu(un, rcu); } diff --git a/kernel/Makefile b/kernel/Makefile index eb26e12c6c2a..314e7d62f5f0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -84,6 +84,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o +obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f9ec9add2164..215871bda3a2 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -301,7 +301,7 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, retry: /* Read the page with vaddr into memory */ ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, - &vma); + &vma, NULL); if (ret <= 0) return ret; @@ -1712,7 +1712,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) * essentially a kernel access to the memory. */ result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, - NULL); + NULL, NULL); if (result < 0) return result; diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index cd6716e115e8..3c8bb6571128 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -121,3 +121,8 @@ long clock_nanosleep_restart(struct restart_block *restart_block) return hrtimer_nanosleep_restart(restart_block); } #endif + +int posix_timer_event(struct k_itimer *timr, int si_private) +{ + return 0; +} diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6d1020c03d41..d4b0fa01cae3 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -24,32 +24,14 @@ #include <asm/irq_regs.h> #include <linux/kvm_para.h> -#include <linux/perf_event.h> #include <linux/kthread.h> -/* - * The run state of the lockup detectors is controlled by the content of the - * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit - - * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector. - * - * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled' - * are variables that are only used as an 'interface' between the parameters - * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The - * 'watchdog_thresh' variable is handled differently because its value is not - * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh' - * is equal zero. - */ -#define NMI_WATCHDOG_ENABLED_BIT 0 -#define SOFT_WATCHDOG_ENABLED_BIT 1 -#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) -#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) - static DEFINE_MUTEX(watchdog_proc_mutex); -#ifdef CONFIG_HARDLOCKUP_DETECTOR -static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; +#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) +unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; #else -static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; +unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; #endif int __read_mostly nmi_watchdog_enabled; int __read_mostly soft_watchdog_enabled; @@ -59,9 +41,6 @@ int __read_mostly watchdog_thresh = 10; #ifdef CONFIG_SMP int __read_mostly sysctl_softlockup_all_cpu_backtrace; int __read_mostly sysctl_hardlockup_all_cpu_backtrace; -#else -#define sysctl_softlockup_all_cpu_backtrace 0 -#define sysctl_hardlockup_all_cpu_backtrace 0 #endif static struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); @@ -100,50 +79,9 @@ static DEFINE_PER_CPU(bool, soft_watchdog_warn); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); -#ifdef CONFIG_HARDLOCKUP_DETECTOR -static DEFINE_PER_CPU(bool, hard_watchdog_warn); -static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); -static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); -#endif static unsigned long soft_lockup_nmi_warn; -/* boot commands */ -/* - * Should we panic when a soft-lockup or hard-lockup occurs: - */ -#ifdef CONFIG_HARDLOCKUP_DETECTOR -unsigned int __read_mostly hardlockup_panic = - CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; -static unsigned long hardlockup_allcpu_dumped; -/* - * We may not want to enable hard lockup detection by default in all cases, - * for example when running the kernel as a guest on a hypervisor. In these - * cases this function can be called to disable hard lockup detection. This - * function should only be executed once by the boot processor before the - * kernel command line parameters are parsed, because otherwise it is not - * possible to override this in hardlockup_panic_setup(). - */ -void hardlockup_detector_disable(void) -{ - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; -} - -static int __init hardlockup_panic_setup(char *str) -{ - if (!strncmp(str, "panic", 5)) - hardlockup_panic = 1; - else if (!strncmp(str, "nopanic", 7)) - hardlockup_panic = 0; - else if (!strncmp(str, "0", 1)) - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; - else if (!strncmp(str, "1", 1)) - watchdog_enabled |= NMI_WATCHDOG_ENABLED; - return 1; -} -__setup("nmi_watchdog=", hardlockup_panic_setup); -#endif - unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; @@ -264,32 +202,14 @@ void touch_all_softlockup_watchdogs(void) wq_watchdog_touch(-1); } -#ifdef CONFIG_HARDLOCKUP_DETECTOR -void touch_nmi_watchdog(void) -{ - /* - * Using __raw here because some code paths have - * preemption enabled. If preemption is enabled - * then interrupts should be enabled too, in which - * case we shouldn't have to worry about the watchdog - * going off. - */ - raw_cpu_write(watchdog_nmi_touch, true); - touch_softlockup_watchdog(); -} -EXPORT_SYMBOL(touch_nmi_watchdog); - -#endif - void touch_softlockup_watchdog_sync(void) { __this_cpu_write(softlockup_touch_sync, true); __this_cpu_write(watchdog_touch_ts, 0); } -#ifdef CONFIG_HARDLOCKUP_DETECTOR /* watchdog detector functions */ -static bool is_hardlockup(void) +bool is_hardlockup(void) { unsigned long hrint = __this_cpu_read(hrtimer_interrupts); @@ -299,7 +219,6 @@ static bool is_hardlockup(void) __this_cpu_write(hrtimer_interrupts_saved, hrint); return false; } -#endif static int is_softlockup(unsigned long touch_ts) { @@ -313,77 +232,22 @@ static int is_softlockup(unsigned long touch_ts) return 0; } -#ifdef CONFIG_HARDLOCKUP_DETECTOR - -static struct perf_event_attr wd_hw_attr = { - .type = PERF_TYPE_HARDWARE, - .config = PERF_COUNT_HW_CPU_CYCLES, - .size = sizeof(struct perf_event_attr), - .pinned = 1, - .disabled = 1, -}; - -/* Callback function for perf event subsystem */ -static void watchdog_overflow_callback(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - /* Ensure the watchdog never gets throttled */ - event->hw.interrupts = 0; - - if (__this_cpu_read(watchdog_nmi_touch) == true) { - __this_cpu_write(watchdog_nmi_touch, false); - return; - } - - /* check for a hardlockup - * This is done by making sure our timer interrupt - * is incrementing. The timer interrupt should have - * fired multiple times before we overflow'd. If it hasn't - * then this is a good indication the cpu is stuck - */ - if (is_hardlockup()) { - int this_cpu = smp_processor_id(); - - /* only print hardlockups once */ - if (__this_cpu_read(hard_watchdog_warn) == true) - return; - - pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); - print_modules(); - print_irqtrace_events(current); - if (regs) - show_regs(regs); - else - dump_stack(); - - /* - * Perform all-CPU dump only once to avoid multiple hardlockups - * generating interleaving traces - */ - if (sysctl_hardlockup_all_cpu_backtrace && - !test_and_set_bit(0, &hardlockup_allcpu_dumped)) - trigger_allbutself_cpu_backtrace(); - - if (hardlockup_panic) - nmi_panic(regs, "Hard LOCKUP"); - - __this_cpu_write(hard_watchdog_warn, true); - return; - } - - __this_cpu_write(hard_watchdog_warn, false); - return; -} -#endif /* CONFIG_HARDLOCKUP_DETECTOR */ - static void watchdog_interrupt_count(void) { __this_cpu_inc(hrtimer_interrupts); } -static int watchdog_nmi_enable(unsigned int cpu); -static void watchdog_nmi_disable(unsigned int cpu); +/* + * These two functions are mostly architecture specific + * defining them as weak here. + */ +int __weak watchdog_nmi_enable(unsigned int cpu) +{ + return 0; +} +void __weak watchdog_nmi_disable(unsigned int cpu) +{ +} static int watchdog_enable_all_cpus(void); static void watchdog_disable_all_cpus(void); @@ -576,109 +440,6 @@ static void watchdog(unsigned int cpu) watchdog_nmi_disable(cpu); } -#ifdef CONFIG_HARDLOCKUP_DETECTOR -/* - * People like the simple clean cpu node info on boot. - * Reduce the watchdog noise by only printing messages - * that are different from what cpu0 displayed. - */ -static unsigned long cpu0_err; - -static int watchdog_nmi_enable(unsigned int cpu) -{ - struct perf_event_attr *wd_attr; - struct perf_event *event = per_cpu(watchdog_ev, cpu); - - /* nothing to do if the hard lockup detector is disabled */ - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) - goto out; - - /* is it already setup and enabled? */ - if (event && event->state > PERF_EVENT_STATE_OFF) - goto out; - - /* it is setup but not enabled */ - if (event != NULL) - goto out_enable; - - wd_attr = &wd_hw_attr; - wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); - - /* Try to register using hardware perf events */ - event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); - - /* save cpu0 error for future comparision */ - if (cpu == 0 && IS_ERR(event)) - cpu0_err = PTR_ERR(event); - - if (!IS_ERR(event)) { - /* only print for cpu0 or different than cpu0 */ - if (cpu == 0 || cpu0_err) - pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); - goto out_save; - } - - /* - * Disable the hard lockup detector if _any_ CPU fails to set up - * set up the hardware perf event. The watchdog() function checks - * the NMI_WATCHDOG_ENABLED bit periodically. - * - * The barriers are for syncing up watchdog_enabled across all the - * cpus, as clear_bit() does not use barriers. - */ - smp_mb__before_atomic(); - clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); - smp_mb__after_atomic(); - - /* skip displaying the same error again */ - if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) - return PTR_ERR(event); - - /* vary the KERN level based on the returned errno */ - if (PTR_ERR(event) == -EOPNOTSUPP) - pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); - else if (PTR_ERR(event) == -ENOENT) - pr_warn("disabled (cpu%i): hardware events not enabled\n", - cpu); - else - pr_err("disabled (cpu%i): unable to create perf event: %ld\n", - cpu, PTR_ERR(event)); - - pr_info("Shutting down hard lockup detector on all cpus\n"); - - return PTR_ERR(event); - - /* success path */ -out_save: - per_cpu(watchdog_ev, cpu) = event; -out_enable: - perf_event_enable(per_cpu(watchdog_ev, cpu)); -out: - return 0; -} - -static void watchdog_nmi_disable(unsigned int cpu) -{ - struct perf_event *event = per_cpu(watchdog_ev, cpu); - - if (event) { - perf_event_disable(event); - per_cpu(watchdog_ev, cpu) = NULL; - - /* should be in cleanup, but blocks oprofile */ - perf_event_release_kernel(event); - } - if (cpu == 0) { - /* watchdog_nmi_enable() expects this to be zero initially. */ - cpu0_err = 0; - } -} - -#else -static int watchdog_nmi_enable(unsigned int cpu) { return 0; } -static void watchdog_nmi_disable(unsigned int cpu) { return; } -#endif /* CONFIG_HARDLOCKUP_DETECTOR */ - static struct smp_hotplug_thread watchdog_threads = { .store = &softlockup_watchdog, .thread_should_run = watchdog_should_run, diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c new file mode 100644 index 000000000000..84016c8aee6b --- /dev/null +++ b/kernel/watchdog_hld.c @@ -0,0 +1,227 @@ +/* + * Detect hard lockups on a system + * + * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. + * + * Note: Most of this code is borrowed heavily from the original softlockup + * detector, so thanks to Ingo for the initial implementation. + * Some chunks also taken from the old x86-specific nmi watchdog code, thanks + * to those contributors as well. + */ + +#define pr_fmt(fmt) "NMI watchdog: " fmt + +#include <linux/nmi.h> +#include <linux/module.h> +#include <asm/irq_regs.h> +#include <linux/perf_event.h> + +static DEFINE_PER_CPU(bool, hard_watchdog_warn); +static DEFINE_PER_CPU(bool, watchdog_nmi_touch); +static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); + +/* boot commands */ +/* + * Should we panic when a soft-lockup or hard-lockup occurs: + */ +unsigned int __read_mostly hardlockup_panic = + CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; +static unsigned long hardlockup_allcpu_dumped; +/* + * We may not want to enable hard lockup detection by default in all cases, + * for example when running the kernel as a guest on a hypervisor. In these + * cases this function can be called to disable hard lockup detection. This + * function should only be executed once by the boot processor before the + * kernel command line parameters are parsed, because otherwise it is not + * possible to override this in hardlockup_panic_setup(). + */ +void hardlockup_detector_disable(void) +{ + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; +} + +static int __init hardlockup_panic_setup(char *str) +{ + if (!strncmp(str, "panic", 5)) + hardlockup_panic = 1; + else if (!strncmp(str, "nopanic", 7)) + hardlockup_panic = 0; + else if (!strncmp(str, "0", 1)) + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; + else if (!strncmp(str, "1", 1)) + watchdog_enabled |= NMI_WATCHDOG_ENABLED; + return 1; +} +__setup("nmi_watchdog=", hardlockup_panic_setup); + +void touch_nmi_watchdog(void) +{ + /* + * Using __raw here because some code paths have + * preemption enabled. If preemption is enabled + * then interrupts should be enabled too, in which + * case we shouldn't have to worry about the watchdog + * going off. + */ + raw_cpu_write(watchdog_nmi_touch, true); + touch_softlockup_watchdog(); +} +EXPORT_SYMBOL(touch_nmi_watchdog); + +static struct perf_event_attr wd_hw_attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, +}; + +/* Callback function for perf event subsystem */ +static void watchdog_overflow_callback(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + /* Ensure the watchdog never gets throttled */ + event->hw.interrupts = 0; + + if (__this_cpu_read(watchdog_nmi_touch) == true) { + __this_cpu_write(watchdog_nmi_touch, false); + return; + } + + /* check for a hardlockup + * This is done by making sure our timer interrupt + * is incrementing. The timer interrupt should have + * fired multiple times before we overflow'd. If it hasn't + * then this is a good indication the cpu is stuck + */ + if (is_hardlockup()) { + int this_cpu = smp_processor_id(); + + /* only print hardlockups once */ + if (__this_cpu_read(hard_watchdog_warn) == true) + return; + + pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); + else + dump_stack(); + + /* + * Perform all-CPU dump only once to avoid multiple hardlockups + * generating interleaving traces + */ + if (sysctl_hardlockup_all_cpu_backtrace && + !test_and_set_bit(0, &hardlockup_allcpu_dumped)) + trigger_allbutself_cpu_backtrace(); + + if (hardlockup_panic) + nmi_panic(regs, "Hard LOCKUP"); + + __this_cpu_write(hard_watchdog_warn, true); + return; + } + + __this_cpu_write(hard_watchdog_warn, false); + return; +} + +/* + * People like the simple clean cpu node info on boot. + * Reduce the watchdog noise by only printing messages + * that are different from what cpu0 displayed. + */ +static unsigned long cpu0_err; + +int watchdog_nmi_enable(unsigned int cpu) +{ + struct perf_event_attr *wd_attr; + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + /* nothing to do if the hard lockup detector is disabled */ + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + goto out; + + /* is it already setup and enabled? */ + if (event && event->state > PERF_EVENT_STATE_OFF) + goto out; + + /* it is setup but not enabled */ + if (event != NULL) + goto out_enable; + + wd_attr = &wd_hw_attr; + wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); + + /* Try to register using hardware perf events */ + event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); + + /* save cpu0 error for future comparision */ + if (cpu == 0 && IS_ERR(event)) + cpu0_err = PTR_ERR(event); + + if (!IS_ERR(event)) { + /* only print for cpu0 or different than cpu0 */ + if (cpu == 0 || cpu0_err) + pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); + goto out_save; + } + + /* + * Disable the hard lockup detector if _any_ CPU fails to set up + * set up the hardware perf event. The watchdog() function checks + * the NMI_WATCHDOG_ENABLED bit periodically. + * + * The barriers are for syncing up watchdog_enabled across all the + * cpus, as clear_bit() does not use barriers. + */ + smp_mb__before_atomic(); + clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); + smp_mb__after_atomic(); + + /* skip displaying the same error again */ + if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) + return PTR_ERR(event); + + /* vary the KERN level based on the returned errno */ + if (PTR_ERR(event) == -EOPNOTSUPP) + pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); + else if (PTR_ERR(event) == -ENOENT) + pr_warn("disabled (cpu%i): hardware events not enabled\n", + cpu); + else + pr_err("disabled (cpu%i): unable to create perf event: %ld\n", + cpu, PTR_ERR(event)); + + pr_info("Shutting down hard lockup detector on all cpus\n"); + + return PTR_ERR(event); + + /* success path */ +out_save: + per_cpu(watchdog_ev, cpu) = event; +out_enable: + perf_event_enable(per_cpu(watchdog_ev, cpu)); +out: + return 0; +} + +void watchdog_nmi_disable(unsigned int cpu) +{ + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + if (event) { + perf_event_disable(event); + per_cpu(watchdog_ev, cpu) = NULL; + + /* should be in cleanup, but blocks oprofile */ + perf_event_release_kernel(event); + } + if (cpu == 0) { + /* watchdog_nmi_enable() expects this to be zero initially. */ + cpu0_err = 0; + } +} diff --git a/mm/filemap.c b/mm/filemap.c index 69568388c699..235021e361eb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2164,12 +2164,12 @@ page_not_uptodate: } EXPORT_SYMBOL(filemap_fault); -void filemap_map_pages(struct fault_env *fe, +void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { struct radix_tree_iter iter; void **slot; - struct file *file = fe->vma->vm_file; + struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; loff_t size; @@ -2225,11 +2225,11 @@ repeat: if (file->f_ra.mmap_miss > 0) file->f_ra.mmap_miss--; - fe->address += (iter.index - last_pgoff) << PAGE_SHIFT; - if (fe->pte) - fe->pte += iter.index - last_pgoff; + vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT; + if (vmf->pte) + vmf->pte += iter.index - last_pgoff; last_pgoff = iter.index; - if (alloc_set_pte(fe, NULL, page)) + if (alloc_set_pte(vmf, NULL, page)) goto unlock; unlock_page(page); goto next; @@ -2239,7 +2239,7 @@ skip: put_page(page); next: /* Huge page is mapped? No need to proceed. */ - if (pmd_trans_huge(*fe->pmd)) + if (pmd_trans_huge(*vmf->pmd)) break; if (iter.index == end_pgoff) break; @@ -865,9 +865,10 @@ EXPORT_SYMBOL(get_user_pages_locked); * caller if required (just like with __get_user_pages). "FOLL_GET" * is set implicitly if "pages" is non-NULL. */ -__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - struct page **pages, unsigned int gup_flags) +static __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, + struct mm_struct *mm, unsigned long start, + unsigned long nr_pages, struct page **pages, + unsigned int gup_flags) { long ret; int locked = 1; @@ -879,7 +880,6 @@ __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct m up_read(&mm->mmap_sem); return ret; } -EXPORT_SYMBOL(__get_user_pages_unlocked); /* * get_user_pages_unlocked() is suitable to replace the form: @@ -917,6 +917,9 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * only intends to ensure the pages are faulted in. * @vmas: array of pointers to vmas corresponding to each page. * Or NULL if the caller does not require them. + * @locked: pointer to lock flag indicating whether lock is held and + * subsequently whether VM_FAULT_RETRY functionality can be + * utilised. Lock must initially be held. * * Returns number of pages pinned. This may be fewer than the number * requested. If nr_pages is 0 or negative, returns 0. If no pages @@ -960,10 +963,10 @@ EXPORT_SYMBOL(get_user_pages_unlocked); long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) + struct vm_area_struct **vmas, int *locked) { return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, - NULL, false, + locked, true, gup_flags | FOLL_TOUCH | FOLL_REMOTE); } EXPORT_SYMBOL(get_user_pages_remote); @@ -971,8 +974,9 @@ EXPORT_SYMBOL(get_user_pages_remote); /* * This is the same as get_user_pages_remote(), just with a * less-flexible calling convention where we assume that the task - * and mm being operated on are the current task's. We also - * obviously don't pass FOLL_REMOTE in here. + * and mm being operated on are the current task's and don't allow + * passing of a locked parameter. We also obviously don't pass + * FOLL_REMOTE in here. */ long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e96c670fe8b7..0957e654b3c9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -532,13 +532,13 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); -static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, +static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, gfp_t gfp) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct mem_cgroup *memcg; pgtable_t pgtable; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; VM_BUG_ON_PAGE(!PageCompound(page), page); @@ -563,9 +563,9 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, */ __SetPageUptodate(page); - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_none(*fe->pmd))) { - spin_unlock(fe->ptl); + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_none(*vmf->pmd))) { + spin_unlock(vmf->ptl); mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(vma->vm_mm, pgtable); @@ -576,11 +576,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, if (userfaultfd_missing(vma)) { int ret; - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(vma->vm_mm, pgtable); - ret = handle_userfault(fe, VM_UFFD_MISSING); + ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); return ret; } @@ -590,11 +590,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, page_add_new_anon_rmap(page, vma, haddr, true); mem_cgroup_commit_charge(page, memcg, false, true); lru_cache_add_active_or_unevictable(page, vma); - pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable); - set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); atomic_long_inc(&vma->vm_mm->nr_ptes); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); } @@ -641,12 +641,12 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, return true; } -int do_huge_pmd_anonymous_page(struct fault_env *fe) +int do_huge_pmd_anonymous_page(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; gfp_t gfp; struct page *page; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) return VM_FAULT_FALLBACK; @@ -654,7 +654,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) return VM_FAULT_OOM; if (unlikely(khugepaged_enter(vma, vma->vm_flags))) return VM_FAULT_OOM; - if (!(fe->flags & FAULT_FLAG_WRITE) && + if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && transparent_hugepage_use_zero_page()) { pgtable_t pgtable; @@ -670,22 +670,22 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); ret = 0; set = false; - if (pmd_none(*fe->pmd)) { + if (pmd_none(*vmf->pmd)) { if (userfaultfd_missing(vma)) { - spin_unlock(fe->ptl); - ret = handle_userfault(fe, VM_UFFD_MISSING); + spin_unlock(vmf->ptl); + ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); } else { set_huge_zero_page(pgtable, vma->vm_mm, vma, - haddr, fe->pmd, zero_page); - spin_unlock(fe->ptl); + haddr, vmf->pmd, zero_page); + spin_unlock(vmf->ptl); set = true; } } else - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); if (!set) pte_free(vma->vm_mm, pgtable); return ret; @@ -697,7 +697,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) return VM_FAULT_FALLBACK; } prep_transhuge_page(page); - return __do_huge_pmd_anonymous_page(fe, page, gfp); + return __do_huge_pmd_anonymous_page(vmf, page, gfp); } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, @@ -869,30 +869,30 @@ out: return ret; } -void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd) +void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd) { pmd_t entry; unsigned long haddr; - fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd); - if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) + vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) goto unlock; entry = pmd_mkyoung(orig_pmd); - haddr = fe->address & HPAGE_PMD_MASK; - if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry, - fe->flags & FAULT_FLAG_WRITE)) - update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd); + haddr = vmf->address & HPAGE_PMD_MASK; + if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, + vmf->flags & FAULT_FLAG_WRITE)) + update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd); unlock: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); } -static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, +static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, struct page *page) { - struct vm_area_struct *vma = fe->vma; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + struct vm_area_struct *vma = vmf->vma; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; struct mem_cgroup *memcg; pgtable_t pgtable; pmd_t _pmd; @@ -911,7 +911,7 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, for (i = 0; i < HPAGE_PMD_NR; i++) { pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | __GFP_OTHER_NODE, vma, - fe->address, page_to_nid(page)); + vmf->address, page_to_nid(page)); if (unlikely(!pages[i] || mem_cgroup_try_charge(pages[i], vma->vm_mm, GFP_KERNEL, &memcg, false))) { @@ -942,15 +942,15 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) goto out_free_pages; VM_BUG_ON_PAGE(!PageHead(page), page); - pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); + pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); /* leave pmd empty until pte is filled */ - pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd); + pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd); pmd_populate(vma->vm_mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -959,20 +959,20 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, entry = maybe_mkwrite(pte_mkdirty(entry), vma); memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); - page_add_new_anon_rmap(pages[i], fe->vma, haddr, false); + page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false); mem_cgroup_commit_charge(pages[i], memcg, false, false); lru_cache_add_active_or_unevictable(pages[i], vma); - fe->pte = pte_offset_map(&_pmd, haddr); - VM_BUG_ON(!pte_none(*fe->pte)); - set_pte_at(vma->vm_mm, haddr, fe->pte, entry); - pte_unmap(fe->pte); + vmf->pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte_none(*vmf->pte)); + set_pte_at(vma->vm_mm, haddr, vmf->pte, entry); + pte_unmap(vmf->pte); } kfree(pages); smp_wmb(); /* make pte visible before pmd */ - pmd_populate(vma->vm_mm, fe->pmd, pgtable); + pmd_populate(vma->vm_mm, vmf->pmd, pgtable); page_remove_rmap(page, true); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); @@ -983,7 +983,7 @@ out: return ret; out_free_pages: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); for (i = 0; i < HPAGE_PMD_NR; i++) { memcg = (void *)page_private(pages[i]); @@ -995,23 +995,23 @@ out_free_pages: goto out; } -int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd) +int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *page = NULL, *new_page; struct mem_cgroup *memcg; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ gfp_t huge_gfp; /* for allocation and charge */ int ret = 0; - fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd); + vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); if (is_huge_zero_pmd(orig_pmd)) goto alloc; - spin_lock(fe->ptl); - if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) + spin_lock(vmf->ptl); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) goto out_unlock; page = pmd_page(orig_pmd); @@ -1024,13 +1024,13 @@ int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd) pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry, 1)) - update_mmu_cache_pmd(vma, fe->address, fe->pmd); + if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); ret |= VM_FAULT_WRITE; goto out_unlock; } get_page(page); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { @@ -1043,12 +1043,12 @@ alloc: prep_transhuge_page(new_page); } else { if (!page) { - split_huge_pmd(vma, fe->pmd, fe->address); + split_huge_pmd(vma, vmf->pmd, vmf->address); ret |= VM_FAULT_FALLBACK; } else { - ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page); + ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page); if (ret & VM_FAULT_OOM) { - split_huge_pmd(vma, fe->pmd, fe->address); + split_huge_pmd(vma, vmf->pmd, vmf->address); ret |= VM_FAULT_FALLBACK; } put_page(page); @@ -1060,7 +1060,7 @@ alloc: if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, huge_gfp, &memcg, true))) { put_page(new_page); - split_huge_pmd(vma, fe->pmd, fe->address); + split_huge_pmd(vma, vmf->pmd, vmf->address); if (page) put_page(page); ret |= VM_FAULT_FALLBACK; @@ -1080,11 +1080,11 @@ alloc: mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); - spin_lock(fe->ptl); + spin_lock(vmf->ptl); if (page) put_page(page); - if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) { - spin_unlock(fe->ptl); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { + spin_unlock(vmf->ptl); mem_cgroup_cancel_charge(new_page, memcg, true); put_page(new_page); goto out_mn; @@ -1092,12 +1092,12 @@ alloc: pmd_t entry; entry = mk_huge_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); + pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); page_add_new_anon_rmap(new_page, vma, haddr, true); mem_cgroup_commit_charge(new_page, memcg, false, true); lru_cache_add_active_or_unevictable(new_page, vma); - set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); - update_mmu_cache_pmd(vma, fe->address, fe->pmd); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); if (!page) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); } else { @@ -1107,13 +1107,13 @@ alloc: } ret |= VM_FAULT_WRITE; } - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); out_mn: mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); out: return ret; out_unlock: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); return ret; } @@ -1186,12 +1186,12 @@ out: } /* NUMA hinting page fault entry point for trans huge pmds */ -int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) +int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct anon_vma *anon_vma = NULL; struct page *page; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; int page_nid = -1, this_nid = numa_node_id(); int target_nid, last_cpupid = -1; bool page_locked; @@ -1199,8 +1199,8 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) bool was_writable; int flags = 0; - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_same(pmd, *fe->pmd))) + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(pmd, *vmf->pmd))) goto out_unlock; /* @@ -1208,9 +1208,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) * without disrupting NUMA hinting information. Do not relock and * check_same as the page may no longer be mapped. */ - if (unlikely(pmd_trans_migrating(*fe->pmd))) { - page = pmd_page(*fe->pmd); - spin_unlock(fe->ptl); + if (unlikely(pmd_trans_migrating(*vmf->pmd))) { + page = pmd_page(*vmf->pmd); + spin_unlock(vmf->ptl); wait_on_page_locked(page); goto out; } @@ -1243,7 +1243,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) /* Migration could have started since the pmd_trans_migrating check */ if (!page_locked) { - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); wait_on_page_locked(page); page_nid = -1; goto out; @@ -1254,12 +1254,12 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) * to serialises splits */ get_page(page); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); anon_vma = page_lock_anon_vma_read(page); /* Confirm the PMD did not change while page_table_lock was released */ - spin_lock(fe->ptl); - if (unlikely(!pmd_same(pmd, *fe->pmd))) { + spin_lock(vmf->ptl); + if (unlikely(!pmd_same(pmd, *vmf->pmd))) { unlock_page(page); put_page(page); page_nid = -1; @@ -1277,9 +1277,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) * Migrate the THP to the requested node, returns with page unlocked * and access rights restored. */ - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, - fe->pmd, pmd, fe->address, page, target_nid); + vmf->pmd, pmd, vmf->address, page, target_nid); if (migrated) { flags |= TNF_MIGRATED; page_nid = target_nid; @@ -1294,18 +1294,19 @@ clear_pmdnuma: pmd = pmd_mkyoung(pmd); if (was_writable) pmd = pmd_mkwrite(pmd); - set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd); - update_mmu_cache_pmd(vma, fe->address, fe->pmd); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); unlock_page(page); out_unlock: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); out: if (anon_vma) page_unlock_anon_vma_read(anon_vma); if (page_nid != -1) - task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags); + task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, + vmf->flags); return 0; } diff --git a/mm/internal.h b/mm/internal.h index 537ac9951f5f..44d68895a9b9 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -36,7 +36,7 @@ /* Do not use these with a slab allocator */ #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) -int do_swap_page(struct fault_env *fe, pte_t orig_pte); +int do_swap_page(struct vm_fault *vmf); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 09460955e818..7434a63cac94 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -875,13 +875,13 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int referenced) { - pte_t pteval; int swapped_in = 0, ret = 0; - struct fault_env fe = { + struct vm_fault vmf = { .vma = vma, .address = address, .flags = FAULT_FLAG_ALLOW_RETRY, .pmd = pmd, + .pgoff = linear_page_index(vma, address), }; /* we only decide to swapin, if there is enough young ptes */ @@ -889,19 +889,19 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; } - fe.pte = pte_offset_map(pmd, address); - for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE; - fe.pte++, fe.address += PAGE_SIZE) { - pteval = *fe.pte; - if (!is_swap_pte(pteval)) + vmf.pte = pte_offset_map(pmd, address); + for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE; + vmf.pte++, vmf.address += PAGE_SIZE) { + vmf.orig_pte = *vmf.pte; + if (!is_swap_pte(vmf.orig_pte)) continue; swapped_in++; - ret = do_swap_page(&fe, pteval); + ret = do_swap_page(&vmf); /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ if (ret & VM_FAULT_RETRY) { down_read(&mm->mmap_sem); - if (hugepage_vma_revalidate(mm, address, &fe.vma)) { + if (hugepage_vma_revalidate(mm, address, &vmf.vma)) { /* vma is no longer available, don't continue to swapin */ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; @@ -915,10 +915,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, return false; } /* pte is unmapped now, we need to map it */ - fe.pte = pte_offset_map(pmd, fe.address); + vmf.pte = pte_offset_map(pmd, vmf.address); } - fe.pte--; - pte_unmap(fe.pte); + vmf.pte--; + pte_unmap(vmf.pte); trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); return true; } diff --git a/mm/memory.c b/mm/memory.c index 08d8da39de28..455c3e628d52 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2034,20 +2034,17 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) * * We do this without the lock held, so that it can sleep if it needs to. */ -static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, - unsigned long address) +static int do_page_mkwrite(struct vm_fault *vmf) { - struct vm_fault vmf; int ret; + struct page *page = vmf->page; + unsigned int old_flags = vmf->flags; - vmf.virtual_address = (void __user *)(address & PAGE_MASK); - vmf.pgoff = page->index; - vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - vmf.gfp_mask = __get_fault_gfp_mask(vma); - vmf.page = page; - vmf.cow_page = NULL; + vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - ret = vma->vm_ops->page_mkwrite(vma, &vmf); + ret = vmf->vma->vm_ops->page_mkwrite(vmf->vma, vmf); + /* Restore original flags so that caller is not surprised */ + vmf->flags = old_flags; if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) return ret; if (unlikely(!(ret & VM_FAULT_LOCKED))) { @@ -2063,6 +2060,41 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, } /* + * Handle dirtying of a page in shared file mapping on a write fault. + * + * The function expects the page to be locked and unlocks it. + */ +static void fault_dirty_shared_page(struct vm_area_struct *vma, + struct page *page) +{ + struct address_space *mapping; + bool dirtied; + bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite; + + dirtied = set_page_dirty(page); + VM_BUG_ON_PAGE(PageAnon(page), page); + /* + * Take a local copy of the address_space - page.mapping may be zeroed + * by truncate after unlock_page(). The address_space itself remains + * pinned by vma->vm_file's reference. We rely on unlock_page()'s + * release semantics to prevent the compiler from undoing this copying. + */ + mapping = page_rmapping(page); + unlock_page(page); + + if ((dirtied || page_mkwrite) && mapping) { + /* + * Some device drivers do not set page.mapping + * but still dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); + } + + if (!page_mkwrite) + file_update_time(vma->vm_file); +} + +/* * Handle write page faults for pages that can be reused in the current vma * * This can happen either due to the mapping being with the VM_SHARED flag, @@ -2070,11 +2102,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, * case, all we need to do here is to mark the page as writable and update * any related book-keeping. */ -static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, - struct page *page, int page_mkwrite, int dirty_shared) - __releases(fe->ptl) +static inline void wp_page_reuse(struct vm_fault *vmf) + __releases(vmf->ptl) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; + struct page *page = vmf->page; pte_t entry; /* * Clear the pages cpupid information as the existing @@ -2084,39 +2116,12 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, if (page) page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); - flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); - entry = pte_mkyoung(orig_pte); + flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); + entry = pte_mkyoung(vmf->orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1)) - update_mmu_cache(vma, fe->address, fe->pte); - pte_unmap_unlock(fe->pte, fe->ptl); - - if (dirty_shared) { - struct address_space *mapping; - int dirtied; - - if (!page_mkwrite) - lock_page(page); - - dirtied = set_page_dirty(page); - VM_BUG_ON_PAGE(PageAnon(page), page); - mapping = page->mapping; - unlock_page(page); - put_page(page); - - if ((dirtied || page_mkwrite) && mapping) { - /* - * Some device drivers do not set page.mapping - * but still dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } - - if (!page_mkwrite) - file_update_time(vma->vm_file); - } - - return VM_FAULT_WRITE; + if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) + update_mmu_cache(vma, vmf->address, vmf->pte); + pte_unmap_unlock(vmf->pte, vmf->ptl); } /* @@ -2135,31 +2140,32 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, * held to the old page, as well as updating the rmap. * - In any case, unlock the PTL and drop the reference we took to the old page. */ -static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, - struct page *old_page) +static int wp_page_copy(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; + struct page *old_page = vmf->page; struct page *new_page = NULL; pte_t entry; int page_copied = 0; - const unsigned long mmun_start = fe->address & PAGE_MASK; + const unsigned long mmun_start = vmf->address & PAGE_MASK; const unsigned long mmun_end = mmun_start + PAGE_SIZE; struct mem_cgroup *memcg; if (unlikely(anon_vma_prepare(vma))) goto oom; - if (is_zero_pfn(pte_pfn(orig_pte))) { - new_page = alloc_zeroed_user_highpage_movable(vma, fe->address); + if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { + new_page = alloc_zeroed_user_highpage_movable(vma, + vmf->address); if (!new_page) goto oom; } else { new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, - fe->address); + vmf->address); if (!new_page) goto oom; - cow_user_page(new_page, old_page, fe->address, vma); + cow_user_page(new_page, old_page, vmf->address, vma); } if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) @@ -2172,8 +2178,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, /* * Re-check the pte - we dropped the lock */ - fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl); - if (likely(pte_same(*fe->pte, orig_pte))) { + vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); + if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { dec_mm_counter_fast(mm, @@ -2183,7 +2189,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, } else { inc_mm_counter_fast(mm, MM_ANONPAGES); } - flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); + flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* @@ -2192,8 +2198,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, * seen in the presence of one thread doing SMC and another * thread doing COW. */ - ptep_clear_flush_notify(vma, fe->address, fe->pte); - page_add_new_anon_rmap(new_page, vma, fe->address, false); + ptep_clear_flush_notify(vma, vmf->address, vmf->pte); + page_add_new_anon_rmap(new_page, vma, vmf->address, false); mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); /* @@ -2201,8 +2207,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, * mmu page tables (such as kvm shadow page tables), we want the * new page to be mapped directly into the secondary page table. */ - set_pte_at_notify(mm, fe->address, fe->pte, entry); - update_mmu_cache(vma, fe->address, fe->pte); + set_pte_at_notify(mm, vmf->address, vmf->pte, entry); + update_mmu_cache(vma, vmf->address, vmf->pte); if (old_page) { /* * Only after switching the pte to the new page may @@ -2239,7 +2245,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, if (new_page) put_page(new_page); - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); if (old_page) { /* @@ -2263,79 +2269,91 @@ oom: return VM_FAULT_OOM; } +/** + * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE + * writeable once the page is prepared + * + * @vmf: structure describing the fault + * + * This function handles all that is needed to finish a write page fault in a + * shared mapping due to PTE being read-only once the mapped page is prepared. + * It handles locking of PTE and modifying it. The function returns + * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE + * lock. + * + * The function expects the page to be locked or other protection against + * concurrent faults / writeback (such as DAX radix tree locks). + */ +int finish_mkwrite_fault(struct vm_fault *vmf) +{ + WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); + vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); + /* + * We might have raced with another page fault while we released the + * pte_offset_map_lock. + */ + if (!pte_same(*vmf->pte, vmf->orig_pte)) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + return VM_FAULT_NOPAGE; + } + wp_page_reuse(vmf); + return 0; +} + /* * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED * mapping */ -static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte) +static int wp_pfn_shared(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { - struct vm_fault vmf = { - .page = NULL, - .pgoff = linear_page_index(vma, fe->address), - .virtual_address = - (void __user *)(fe->address & PAGE_MASK), - .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, - }; int ret; - pte_unmap_unlock(fe->pte, fe->ptl); - ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); - if (ret & VM_FAULT_ERROR) + pte_unmap_unlock(vmf->pte, vmf->ptl); + vmf->flags |= FAULT_FLAG_MKWRITE; + ret = vma->vm_ops->pfn_mkwrite(vma, vmf); + if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)) return ret; - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - /* - * We might have raced with another page fault while we - * released the pte_offset_map_lock. - */ - if (!pte_same(*fe->pte, orig_pte)) { - pte_unmap_unlock(fe->pte, fe->ptl); - return 0; - } + return finish_mkwrite_fault(vmf); } - return wp_page_reuse(fe, orig_pte, NULL, 0, 0); + wp_page_reuse(vmf); + return VM_FAULT_WRITE; } -static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, - struct page *old_page) - __releases(fe->ptl) +static int wp_page_shared(struct vm_fault *vmf) + __releases(vmf->ptl) { - struct vm_area_struct *vma = fe->vma; - int page_mkwrite = 0; + struct vm_area_struct *vma = vmf->vma; - get_page(old_page); + get_page(vmf->page); if (vma->vm_ops && vma->vm_ops->page_mkwrite) { int tmp; - pte_unmap_unlock(fe->pte, fe->ptl); - tmp = do_page_mkwrite(vma, old_page, fe->address); + pte_unmap_unlock(vmf->pte, vmf->ptl); + tmp = do_page_mkwrite(vmf); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { - put_page(old_page); + put_page(vmf->page); return tmp; } - /* - * Since we dropped the lock we need to revalidate - * the PTE as someone else may have changed it. If - * they did, we just return, as we can count on the - * MMU to tell us if they didn't also make it writable. - */ - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - if (!pte_same(*fe->pte, orig_pte)) { - unlock_page(old_page); - pte_unmap_unlock(fe->pte, fe->ptl); - put_page(old_page); - return 0; + tmp = finish_mkwrite_fault(vmf); + if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { + unlock_page(vmf->page); + put_page(vmf->page); + return tmp; } - page_mkwrite = 1; + } else { + wp_page_reuse(vmf); + lock_page(vmf->page); } + fault_dirty_shared_page(vma, vmf->page); + put_page(vmf->page); - return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1); + return VM_FAULT_WRITE; } /* @@ -2356,14 +2374,13 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, * but allow concurrent faults), with pte both mapped and locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_wp_page(struct fault_env *fe, pte_t orig_pte) - __releases(fe->ptl) +static int do_wp_page(struct vm_fault *vmf) + __releases(vmf->ptl) { - struct vm_area_struct *vma = fe->vma; - struct page *old_page; + struct vm_area_struct *vma = vmf->vma; - old_page = vm_normal_page(vma, fe->address, orig_pte); - if (!old_page) { + vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); + if (!vmf->page) { /* * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a * VM_PFNMAP VMA. @@ -2373,33 +2390,33 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte) */ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED)) - return wp_pfn_shared(fe, orig_pte); + return wp_pfn_shared(vmf); - pte_unmap_unlock(fe->pte, fe->ptl); - return wp_page_copy(fe, orig_pte, old_page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + return wp_page_copy(vmf); } /* * Take out anonymous pages first, anonymous shared vmas are * not dirty accountable. */ - if (PageAnon(old_page) && !PageKsm(old_page)) { + if (PageAnon(vmf->page) && !PageKsm(vmf->page)) { int total_mapcount; - if (!trylock_page(old_page)) { - get_page(old_page); - pte_unmap_unlock(fe->pte, fe->ptl); - lock_page(old_page); - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, - fe->address, &fe->ptl); - if (!pte_same(*fe->pte, orig_pte)) { - unlock_page(old_page); - pte_unmap_unlock(fe->pte, fe->ptl); - put_page(old_page); + if (!trylock_page(vmf->page)) { + get_page(vmf->page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + lock_page(vmf->page); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (!pte_same(*vmf->pte, vmf->orig_pte)) { + unlock_page(vmf->page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + put_page(vmf->page); return 0; } - put_page(old_page); + put_page(vmf->page); } - if (reuse_swap_page(old_page, &total_mapcount)) { + if (reuse_swap_page(vmf->page, &total_mapcount)) { if (total_mapcount == 1) { /* * The page is all ours. Move it to @@ -2408,24 +2425,25 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte) * Protected against the rmap code by * the page lock. */ - page_move_anon_rmap(old_page, vma); + page_move_anon_rmap(vmf->page, vma); } - unlock_page(old_page); - return wp_page_reuse(fe, orig_pte, old_page, 0, 0); + unlock_page(vmf->page); + wp_page_reuse(vmf); + return VM_FAULT_WRITE; } - unlock_page(old_page); + unlock_page(vmf->page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { - return wp_page_shared(fe, orig_pte, old_page); + return wp_page_shared(vmf); } /* * Ok, we need to copy. Oh, well.. */ - get_page(old_page); + get_page(vmf->page); - pte_unmap_unlock(fe->pte, fe->ptl); - return wp_page_copy(fe, orig_pte, old_page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + return wp_page_copy(vmf); } static void unmap_mapping_range_vma(struct vm_area_struct *vma, @@ -2513,9 +2531,9 @@ EXPORT_SYMBOL(unmap_mapping_range); * We return with the mmap_sem locked or unlocked in the same cases * as does filemap_fault(). */ -int do_swap_page(struct fault_env *fe, pte_t orig_pte) +int do_swap_page(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *page, *swapcache; struct mem_cgroup *memcg; swp_entry_t entry; @@ -2524,17 +2542,18 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) int exclusive = 0; int ret = 0; - if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte)) + if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) goto out; - entry = pte_to_swp_entry(orig_pte); + entry = pte_to_swp_entry(vmf->orig_pte); if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { - migration_entry_wait(vma->vm_mm, fe->pmd, fe->address); + migration_entry_wait(vma->vm_mm, vmf->pmd, + vmf->address); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else { - print_bad_pte(vma, fe->address, orig_pte, NULL); + print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL); ret = VM_FAULT_SIGBUS; } goto out; @@ -2542,16 +2561,16 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) delayacct_set_flag(DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry); if (!page) { - page = swapin_readahead(entry, - GFP_HIGHUSER_MOVABLE, vma, fe->address); + page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, + vmf->address); if (!page) { /* * Back out if somebody else faulted in this pte * while we released the pte lock. */ - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, - fe->address, &fe->ptl); - if (likely(pte_same(*fe->pte, orig_pte))) + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (likely(pte_same(*vmf->pte, vmf->orig_pte))) ret = VM_FAULT_OOM; delayacct_clear_flag(DELAYACCT_PF_SWAPIN); goto unlock; @@ -2573,7 +2592,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) } swapcache = page; - locked = lock_page_or_retry(page, vma->vm_mm, fe->flags); + locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); if (!locked) { @@ -2590,7 +2609,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) goto out_page; - page = ksm_might_need_to_copy(page, vma, fe->address); + page = ksm_might_need_to_copy(page, vma, vmf->address); if (unlikely(!page)) { ret = VM_FAULT_OOM; page = swapcache; @@ -2606,9 +2625,9 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) /* * Back out if somebody else already faulted in this pte. */ - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - if (unlikely(!pte_same(*fe->pte, orig_pte))) + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) goto out_nomap; if (unlikely(!PageUptodate(page))) { @@ -2629,22 +2648,23 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); - if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { + if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); - fe->flags &= ~FAULT_FLAG_WRITE; + vmf->flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; exclusive = RMAP_EXCLUSIVE; } flush_icache_page(vma, page); - if (pte_swp_soft_dirty(orig_pte)) + if (pte_swp_soft_dirty(vmf->orig_pte)) pte = pte_mksoft_dirty(pte); - set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); + vmf->orig_pte = pte; if (page == swapcache) { - do_page_add_anon_rmap(page, vma, fe->address, exclusive); + do_page_add_anon_rmap(page, vma, vmf->address, exclusive); mem_cgroup_commit_charge(page, memcg, true, false); activate_page(page); } else { /* ksm created a completely new copy */ - page_add_new_anon_rmap(page, vma, fe->address, false); + page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } @@ -2667,22 +2687,22 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) put_page(swapcache); } - if (fe->flags & FAULT_FLAG_WRITE) { - ret |= do_wp_page(fe, pte); + if (vmf->flags & FAULT_FLAG_WRITE) { + ret |= do_wp_page(vmf); if (ret & VM_FAULT_ERROR) ret &= VM_FAULT_ERROR; goto out; } /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, fe->address, fe->pte); + update_mmu_cache(vma, vmf->address, vmf->pte); unlock: - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); out: return ret; out_nomap: mem_cgroup_cancel_charge(page, memcg, false); - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: unlock_page(page); out_release: @@ -2733,9 +2753,9 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_anonymous_page(struct fault_env *fe) +static int do_anonymous_page(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct mem_cgroup *memcg; struct page *page; pte_t entry; @@ -2745,7 +2765,7 @@ static int do_anonymous_page(struct fault_env *fe) return VM_FAULT_SIGBUS; /* Check if we need to add a guard page to the stack */ - if (check_stack_guard_page(vma, fe->address) < 0) + if (check_stack_guard_page(vma, vmf->address) < 0) return VM_FAULT_SIGSEGV; /* @@ -2758,26 +2778,26 @@ static int do_anonymous_page(struct fault_env *fe) * * Here we only have down_read(mmap_sem). */ - if (pte_alloc(vma->vm_mm, fe->pmd, fe->address)) + if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address)) return VM_FAULT_OOM; /* See the comment in pte_alloc_one_map() */ - if (unlikely(pmd_trans_unstable(fe->pmd))) + if (unlikely(pmd_trans_unstable(vmf->pmd))) return 0; /* Use the zero-page for reads */ - if (!(fe->flags & FAULT_FLAG_WRITE) && + if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm)) { - entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address), + entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), vma->vm_page_prot)); - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - if (!pte_none(*fe->pte)) + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (!pte_none(*vmf->pte)) goto unlock; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { - pte_unmap_unlock(fe->pte, fe->ptl); - return handle_userfault(fe, VM_UFFD_MISSING); + pte_unmap_unlock(vmf->pte, vmf->ptl); + return handle_userfault(vmf, VM_UFFD_MISSING); } goto setpte; } @@ -2785,7 +2805,7 @@ static int do_anonymous_page(struct fault_env *fe) /* Allocate our own private page. */ if (unlikely(anon_vma_prepare(vma))) goto oom; - page = alloc_zeroed_user_highpage_movable(vma, fe->address); + page = alloc_zeroed_user_highpage_movable(vma, vmf->address); if (!page) goto oom; @@ -2803,30 +2823,30 @@ static int do_anonymous_page(struct fault_env *fe) if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - if (!pte_none(*fe->pte)) + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); + if (!pte_none(*vmf->pte)) goto release; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); mem_cgroup_cancel_charge(page, memcg, false); put_page(page); - return handle_userfault(fe, VM_UFFD_MISSING); + return handle_userfault(vmf, VM_UFFD_MISSING); } inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, fe->address, false); + page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); setpte: - set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, fe->address, fe->pte); + update_mmu_cache(vma, vmf->address, vmf->pte); unlock: - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; release: mem_cgroup_cancel_charge(page, memcg, false); @@ -2843,62 +2863,50 @@ oom: * released depending on flags and vma->vm_ops->fault() return value. * See filemap_fault() and __lock_page_retry(). */ -static int __do_fault(struct fault_env *fe, pgoff_t pgoff, - struct page *cow_page, struct page **page, void **entry) +static int __do_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; - struct vm_fault vmf; + struct vm_area_struct *vma = vmf->vma; int ret; - vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK); - vmf.pgoff = pgoff; - vmf.flags = fe->flags; - vmf.page = NULL; - vmf.gfp_mask = __get_fault_gfp_mask(vma); - vmf.cow_page = cow_page; - - ret = vma->vm_ops->fault(vma, &vmf); - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) - return ret; - if (ret & VM_FAULT_DAX_LOCKED) { - *entry = vmf.entry; + ret = vma->vm_ops->fault(vma, vmf); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | + VM_FAULT_DONE_COW))) return ret; - } - if (unlikely(PageHWPoison(vmf.page))) { + if (unlikely(PageHWPoison(vmf->page))) { if (ret & VM_FAULT_LOCKED) - unlock_page(vmf.page); - put_page(vmf.page); + unlock_page(vmf->page); + put_page(vmf->page); + vmf->page = NULL; return VM_FAULT_HWPOISON; } if (unlikely(!(ret & VM_FAULT_LOCKED))) - lock_page(vmf.page); + lock_page(vmf->page); else - VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); + VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page); - *page = vmf.page; return ret; } -static int pte_alloc_one_map(struct fault_env *fe) +static int pte_alloc_one_map(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; - if (!pmd_none(*fe->pmd)) + if (!pmd_none(*vmf->pmd)) goto map_pte; - if (fe->prealloc_pte) { - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_none(*fe->pmd))) { - spin_unlock(fe->ptl); + if (vmf->prealloc_pte) { + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_none(*vmf->pmd))) { + spin_unlock(vmf->ptl); goto map_pte; } atomic_long_inc(&vma->vm_mm->nr_ptes); - pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte); - spin_unlock(fe->ptl); - fe->prealloc_pte = 0; - } else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) { + pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); + spin_unlock(vmf->ptl); + vmf->prealloc_pte = 0; + } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) { return VM_FAULT_OOM; } map_pte: @@ -2913,11 +2921,11 @@ map_pte: * through an atomic read in C, which is what pmd_trans_unstable() * provides. */ - if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) + if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd)) return VM_FAULT_NOPAGE; - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); return 0; } @@ -2935,24 +2943,24 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, return true; } -static void deposit_prealloc_pte(struct fault_env *fe) +static void deposit_prealloc_pte(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; - pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte); + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); /* * We are going to consume the prealloc table, * count that as nr_ptes. */ atomic_long_inc(&vma->vm_mm->nr_ptes); - fe->prealloc_pte = 0; + vmf->prealloc_pte = 0; } -static int do_set_pmd(struct fault_env *fe, struct page *page) +static int do_set_pmd(struct vm_fault *vmf, struct page *page) { - struct vm_area_struct *vma = fe->vma; - bool write = fe->flags & FAULT_FLAG_WRITE; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + struct vm_area_struct *vma = vmf->vma; + bool write = vmf->flags & FAULT_FLAG_WRITE; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t entry; int i, ret; @@ -2966,15 +2974,15 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) * Archs like ppc64 need additonal space to store information * related to pte entry. Use the preallocated table for that. */ - if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) { - fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address); - if (!fe->prealloc_pte) + if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) { + vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address); + if (!vmf->prealloc_pte) return VM_FAULT_OOM; smp_wmb(); /* See comment in __pte_alloc() */ } - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_none(*fe->pmd))) + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_none(*vmf->pmd))) goto out; for (i = 0; i < HPAGE_PMD_NR; i++) @@ -2990,11 +2998,11 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) * deposit and withdraw with pmd lock held */ if (arch_needs_pgtable_deposit()) - deposit_prealloc_pte(fe); + deposit_prealloc_pte(vmf); - set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); - update_mmu_cache_pmd(vma, haddr, fe->pmd); + update_mmu_cache_pmd(vma, haddr, vmf->pmd); /* fault is handled */ ret = 0; @@ -3005,13 +3013,13 @@ out: * withdraw with pmd lock held. */ if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK) - fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm, - fe->pmd); - spin_unlock(fe->ptl); + vmf->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm, + vmf->pmd); + spin_unlock(vmf->ptl); return ret; } #else -static int do_set_pmd(struct fault_env *fe, struct page *page) +static int do_set_pmd(struct vm_fault *vmf, struct page *page) { BUILD_BUG(); return 0; @@ -3022,41 +3030,42 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) * alloc_set_pte - setup new PTE entry for given page and add reverse page * mapping. If needed, the fucntion allocates page table or use pre-allocated. * - * @fe: fault environment + * @vmf: fault environment * @memcg: memcg to charge page (only for private mappings) * @page: page to map * - * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return. + * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on + * return. * * Target users are page handler itself and implementations of * vm_ops->map_pages. */ -int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, +int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, struct page *page) { - struct vm_area_struct *vma = fe->vma; - bool write = fe->flags & FAULT_FLAG_WRITE; + struct vm_area_struct *vma = vmf->vma; + bool write = vmf->flags & FAULT_FLAG_WRITE; pte_t entry; int ret; - if (pmd_none(*fe->pmd) && PageTransCompound(page) && + if (pmd_none(*vmf->pmd) && PageTransCompound(page) && IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { /* THP on COW? */ VM_BUG_ON_PAGE(memcg, page); - ret = do_set_pmd(fe, page); + ret = do_set_pmd(vmf, page); if (ret != VM_FAULT_FALLBACK) goto fault_handled; } - if (!fe->pte) { - ret = pte_alloc_one_map(fe); + if (!vmf->pte) { + ret = pte_alloc_one_map(vmf); if (ret) goto fault_handled; } /* Re-check under ptl */ - if (unlikely(!pte_none(*fe->pte))) { + if (unlikely(!pte_none(*vmf->pte))) { ret = VM_FAULT_NOPAGE; goto fault_handled; } @@ -3068,28 +3077,60 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, fe->address, false); + page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, false); } - set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, fe->address, fe->pte); + update_mmu_cache(vma, vmf->address, vmf->pte); ret = 0; fault_handled: /* preallocated pagetable is unused: free it */ - if (fe->prealloc_pte) { - pte_free(fe->vma->vm_mm, fe->prealloc_pte); - fe->prealloc_pte = 0; + if (vmf->prealloc_pte) { + pte_free(vmf->vma->vm_mm, vmf->prealloc_pte); + vmf->prealloc_pte = 0; } return ret; } + +/** + * finish_fault - finish page fault once we have prepared the page to fault + * + * @vmf: structure describing the fault + * + * This function handles all that is needed to finish a page fault once the + * page to fault in is prepared. It handles locking of PTEs, inserts PTE for + * given page, adds reverse page mapping, handles memcg charges and LRU + * addition. The function returns 0 on success, VM_FAULT_ code in case of + * error. + * + * The function expects the page to be locked and on success it consumes a + * reference of a page being mapped (for the PTE which maps it). + */ +int finish_fault(struct vm_fault *vmf) +{ + struct page *page; + int ret; + + /* Did we COW the page? */ + if ((vmf->flags & FAULT_FLAG_WRITE) && + !(vmf->vma->vm_flags & VM_SHARED)) + page = vmf->cow_page; + else + page = vmf->page; + ret = alloc_set_pte(vmf, vmf->memcg, page); + if (vmf->pte) + pte_unmap_unlock(vmf->pte, vmf->ptl); + return ret; +} + static unsigned long fault_around_bytes __read_mostly = rounddown_pow_of_two(65536); @@ -3154,17 +3195,18 @@ late_initcall(fault_around_debugfs); * fault_around_pages() value (and therefore to page order). This way it's * easier to guarantee that we don't cross page table boundaries. */ -static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) +static int do_fault_around(struct vm_fault *vmf) { - unsigned long address = fe->address, nr_pages, mask; + unsigned long address = vmf->address, nr_pages, mask; + pgoff_t start_pgoff = vmf->pgoff; pgoff_t end_pgoff; int off, ret = 0; nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; - fe->address = max(address & mask, fe->vma->vm_start); - off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); + vmf->address = max(address & mask, vmf->vma->vm_start); + off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); start_pgoff -= off; /* @@ -3172,45 +3214,45 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) * or fault_around_pages() from start_pgoff, depending what is nearest. */ end_pgoff = start_pgoff - - ((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + + ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + PTRS_PER_PTE - 1; - end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1, + end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1, start_pgoff + nr_pages - 1); - if (pmd_none(*fe->pmd)) { - fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address); - if (!fe->prealloc_pte) + if (pmd_none(*vmf->pmd)) { + vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm, + vmf->address); + if (!vmf->prealloc_pte) goto out; smp_wmb(); /* See comment in __pte_alloc() */ } - fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff); + vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff); /* Huge page is mapped? Page fault is solved */ - if (pmd_trans_huge(*fe->pmd)) { + if (pmd_trans_huge(*vmf->pmd)) { ret = VM_FAULT_NOPAGE; goto out; } /* ->map_pages() haven't done anything useful. Cold page cache? */ - if (!fe->pte) + if (!vmf->pte) goto out; /* check if the page fault is solved */ - fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); - if (!pte_none(*fe->pte)) + vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); + if (!pte_none(*vmf->pte)) ret = VM_FAULT_NOPAGE; - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); out: - fe->address = address; - fe->pte = NULL; + vmf->address = address; + vmf->pte = NULL; return ret; } -static int do_read_fault(struct fault_env *fe, pgoff_t pgoff) +static int do_read_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; - struct page *fault_page; + struct vm_area_struct *vma = vmf->vma; int ret = 0; /* @@ -3219,80 +3261,67 @@ static int do_read_fault(struct fault_env *fe, pgoff_t pgoff) * something). */ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { - ret = do_fault_around(fe, pgoff); + ret = do_fault_around(vmf); if (ret) return ret; } - ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); + ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; - ret |= alloc_set_pte(fe, NULL, fault_page); - if (fe->pte) - pte_unmap_unlock(fe->pte, fe->ptl); - unlock_page(fault_page); + ret |= finish_fault(vmf); + unlock_page(vmf->page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) - put_page(fault_page); + put_page(vmf->page); return ret; } -static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff) +static int do_cow_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; - struct page *fault_page, *new_page; - void *fault_entry; - struct mem_cgroup *memcg; + struct vm_area_struct *vma = vmf->vma; int ret; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address); - if (!new_page) + vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); + if (!vmf->cow_page) return VM_FAULT_OOM; - if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, - &memcg, false)) { - put_page(new_page); + if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL, + &vmf->memcg, false)) { + put_page(vmf->cow_page); return VM_FAULT_OOM; } - ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry); + ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; + if (ret & VM_FAULT_DONE_COW) + return ret; - if (!(ret & VM_FAULT_DAX_LOCKED)) - copy_user_highpage(new_page, fault_page, fe->address, vma); - __SetPageUptodate(new_page); + copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma); + __SetPageUptodate(vmf->cow_page); - ret |= alloc_set_pte(fe, memcg, new_page); - if (fe->pte) - pte_unmap_unlock(fe->pte, fe->ptl); - if (!(ret & VM_FAULT_DAX_LOCKED)) { - unlock_page(fault_page); - put_page(fault_page); - } else { - dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff); - } + ret |= finish_fault(vmf); + unlock_page(vmf->page); + put_page(vmf->page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; return ret; uncharge_out: - mem_cgroup_cancel_charge(new_page, memcg, false); - put_page(new_page); + mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false); + put_page(vmf->cow_page); return ret; } -static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) +static int do_shared_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; - struct page *fault_page; - struct address_space *mapping; - int dirtied = 0; + struct vm_area_struct *vma = vmf->vma; int ret, tmp; - ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); + ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; @@ -3301,46 +3330,24 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) * about to become writable */ if (vma->vm_ops->page_mkwrite) { - unlock_page(fault_page); - tmp = do_page_mkwrite(vma, fault_page, fe->address); + unlock_page(vmf->page); + tmp = do_page_mkwrite(vmf); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { - put_page(fault_page); + put_page(vmf->page); return tmp; } } - ret |= alloc_set_pte(fe, NULL, fault_page); - if (fe->pte) - pte_unmap_unlock(fe->pte, fe->ptl); + ret |= finish_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) { - unlock_page(fault_page); - put_page(fault_page); + unlock_page(vmf->page); + put_page(vmf->page); return ret; } - if (set_page_dirty(fault_page)) - dirtied = 1; - /* - * Take a local copy of the address_space - page.mapping may be zeroed - * by truncate after unlock_page(). The address_space itself remains - * pinned by vma->vm_file's reference. We rely on unlock_page()'s - * release semantics to prevent the compiler from undoing this copying. - */ - mapping = page_rmapping(fault_page); - unlock_page(fault_page); - if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { - /* - * Some device drivers do not set page.mapping but still - * dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } - - if (!vma->vm_ops->page_mkwrite) - file_update_time(vma->vm_file); - + fault_dirty_shared_page(vma, vmf->page); return ret; } @@ -3350,19 +3357,18 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ -static int do_fault(struct fault_env *fe) +static int do_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; - pgoff_t pgoff = linear_page_index(vma, fe->address); + struct vm_area_struct *vma = vmf->vma; /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ if (!vma->vm_ops->fault) return VM_FAULT_SIGBUS; - if (!(fe->flags & FAULT_FLAG_WRITE)) - return do_read_fault(fe, pgoff); + if (!(vmf->flags & FAULT_FLAG_WRITE)) + return do_read_fault(vmf); if (!(vma->vm_flags & VM_SHARED)) - return do_cow_fault(fe, pgoff); - return do_shared_fault(fe, pgoff); + return do_cow_fault(vmf); + return do_shared_fault(vmf); } static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, @@ -3380,14 +3386,15 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, return mpol_misplaced(page, vma, addr); } -static int do_numa_page(struct fault_env *fe, pte_t pte) +static int do_numa_page(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *page = NULL; int page_nid = -1; int last_cpupid; int target_nid; bool migrated = false; + pte_t pte = vmf->orig_pte; bool was_writable = pte_write(pte); int flags = 0; @@ -3400,10 +3407,10 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) * page table entry is not accessible, so there would be no * concurrent hardware modifications to the PTE. */ - fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd); - spin_lock(fe->ptl); - if (unlikely(!pte_same(*fe->pte, pte))) { - pte_unmap_unlock(fe->pte, fe->ptl); + vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd); + spin_lock(vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, pte))) { + pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } @@ -3412,18 +3419,18 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) pte = pte_mkyoung(pte); if (was_writable) pte = pte_mkwrite(pte); - set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); - update_mmu_cache(vma, fe->address, fe->pte); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); + update_mmu_cache(vma, vmf->address, vmf->pte); - page = vm_normal_page(vma, fe->address, pte); + page = vm_normal_page(vma, vmf->address, pte); if (!page) { - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } /* TODO: handle PTE-mapped THP */ if (PageCompound(page)) { - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } @@ -3447,9 +3454,9 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) last_cpupid = page_cpupid_last(page); page_nid = page_to_nid(page); - target_nid = numa_migrate_prep(page, vma, fe->address, page_nid, + target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, &flags); - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); if (target_nid == -1) { put_page(page); goto out; @@ -3469,28 +3476,28 @@ out: return 0; } -static int create_huge_pmd(struct fault_env *fe) +static int create_huge_pmd(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; if (vma_is_anonymous(vma)) - return do_huge_pmd_anonymous_page(fe); + return do_huge_pmd_anonymous_page(vmf); if (vma->vm_ops->pmd_fault) - return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd, - fe->flags); + return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd, + vmf->flags); return VM_FAULT_FALLBACK; } -static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd) +static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) { - if (vma_is_anonymous(fe->vma)) - return do_huge_pmd_wp_page(fe, orig_pmd); - if (fe->vma->vm_ops->pmd_fault) - return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd, - fe->flags); + if (vma_is_anonymous(vmf->vma)) + return do_huge_pmd_wp_page(vmf, orig_pmd); + if (vmf->vma->vm_ops->pmd_fault) + return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address, + vmf->pmd, vmf->flags); /* COW handled on pte level: split pmd */ - VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma); - __split_huge_pmd(fe->vma, fe->pmd, fe->address, false, NULL); + VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma); + __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL); return VM_FAULT_FALLBACK; } @@ -3515,21 +3522,21 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) * The mmap_sem may have been released depending on flags and our return value. * See filemap_fault() and __lock_page_or_retry(). */ -static int handle_pte_fault(struct fault_env *fe) +static int handle_pte_fault(struct vm_fault *vmf) { pte_t entry; - if (unlikely(pmd_none(*fe->pmd))) { + if (unlikely(pmd_none(*vmf->pmd))) { /* * Leave __pte_alloc() until later: because vm_ops->fault may * want to allocate huge page, and if we expose page table * for an instant, it will be difficult to retract from * concurrent faults and from rmap lookups. */ - fe->pte = NULL; + vmf->pte = NULL; } else { /* See comment in pte_alloc_one_map() */ - if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) + if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd)) return 0; /* * A regular pmd is established and it can't morph into a huge @@ -3537,9 +3544,8 @@ static int handle_pte_fault(struct fault_env *fe) * mmap_sem read mode and khugepaged takes it in write mode. * So now it's safe to run pte_offset_map(). */ - fe->pte = pte_offset_map(fe->pmd, fe->address); - - entry = *fe->pte; + vmf->pte = pte_offset_map(vmf->pmd, vmf->address); + vmf->orig_pte = *vmf->pte; /* * some architectures can have larger ptes than wordsize, @@ -3550,38 +3556,39 @@ static int handle_pte_fault(struct fault_env *fe) * ptl lock held. So here a barrier will do. */ barrier(); - if (pte_none(entry)) { - pte_unmap(fe->pte); - fe->pte = NULL; + if (pte_none(vmf->orig_pte)) { + pte_unmap(vmf->pte); + vmf->pte = NULL; } } - if (!fe->pte) { - if (vma_is_anonymous(fe->vma)) - return do_anonymous_page(fe); + if (!vmf->pte) { + if (vma_is_anonymous(vmf->vma)) + return do_anonymous_page(vmf); else - return do_fault(fe); + return do_fault(vmf); } - if (!pte_present(entry)) - return do_swap_page(fe, entry); + if (!pte_present(vmf->orig_pte)) + return do_swap_page(vmf); - if (pte_protnone(entry) && vma_is_accessible(fe->vma)) - return do_numa_page(fe, entry); + if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) + return do_numa_page(vmf); - fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd); - spin_lock(fe->ptl); - if (unlikely(!pte_same(*fe->pte, entry))) + vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); + spin_lock(vmf->ptl); + entry = vmf->orig_pte; + if (unlikely(!pte_same(*vmf->pte, entry))) goto unlock; - if (fe->flags & FAULT_FLAG_WRITE) { + if (vmf->flags & FAULT_FLAG_WRITE) { if (!pte_write(entry)) - return do_wp_page(fe, entry); + return do_wp_page(vmf); entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry, - fe->flags & FAULT_FLAG_WRITE)) { - update_mmu_cache(fe->vma, fe->address, fe->pte); + if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, + vmf->flags & FAULT_FLAG_WRITE)) { + update_mmu_cache(vmf->vma, vmf->address, vmf->pte); } else { /* * This is needed only for protection faults but the arch code @@ -3589,11 +3596,11 @@ static int handle_pte_fault(struct fault_env *fe) * This still avoids useless tlb flushes for .text page faults * with threads. */ - if (fe->flags & FAULT_FLAG_WRITE) - flush_tlb_fix_spurious_fault(fe->vma, fe->address); + if (vmf->flags & FAULT_FLAG_WRITE) + flush_tlb_fix_spurious_fault(vmf->vma, vmf->address); } unlock: - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } @@ -3606,10 +3613,12 @@ unlock: static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) { - struct fault_env fe = { + struct vm_fault vmf = { .vma = vma, - .address = address, + .address = address & PAGE_MASK, .flags = flags, + .pgoff = linear_page_index(vma, address), + .gfp_mask = __get_fault_gfp_mask(vma), }; struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; @@ -3619,35 +3628,35 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, pud = pud_alloc(mm, pgd, address); if (!pud) return VM_FAULT_OOM; - fe.pmd = pmd_alloc(mm, pud, address); - if (!fe.pmd) + vmf.pmd = pmd_alloc(mm, pud, address); + if (!vmf.pmd) return VM_FAULT_OOM; - if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) { - int ret = create_huge_pmd(&fe); + if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { + int ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { - pmd_t orig_pmd = *fe.pmd; + pmd_t orig_pmd = *vmf.pmd; int ret; barrier(); if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) - return do_huge_pmd_numa_page(&fe, orig_pmd); + return do_huge_pmd_numa_page(&vmf, orig_pmd); - if ((fe.flags & FAULT_FLAG_WRITE) && + if ((vmf.flags & FAULT_FLAG_WRITE) && !pmd_write(orig_pmd)) { - ret = wp_huge_pmd(&fe, orig_pmd); + ret = wp_huge_pmd(&vmf, orig_pmd); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { - huge_pmd_set_accessed(&fe, orig_pmd); + huge_pmd_set_accessed(&vmf, orig_pmd); return 0; } } } - return handle_pte_fault(&fe); + return handle_pte_fault(&vmf); } /* @@ -3808,8 +3817,8 @@ out: return -EINVAL; } -static inline int follow_pte(struct mm_struct *mm, unsigned long address, - pte_t **ptepp, spinlock_t **ptlp) +int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, + spinlock_t **ptlp) { int res; @@ -3919,7 +3928,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, struct page *page = NULL; ret = get_user_pages_remote(tsk, mm, addr, 1, - gup_flags, &page, &vma); + gup_flags, &page, &vma, NULL); if (ret <= 0) { #ifndef CONFIG_HAVE_IOREMAP_PROT break; diff --git a/mm/nommu.c b/mm/nommu.c index 27bc543128e5..210d7ec2843c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -176,9 +176,10 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, } EXPORT_SYMBOL(get_user_pages_locked); -long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - struct page **pages, unsigned int gup_flags) +static long __get_user_pages_unlocked(struct task_struct *tsk, + struct mm_struct *mm, unsigned long start, + unsigned long nr_pages, struct page **pages, + unsigned int gup_flags) { long ret; down_read(&mm->mmap_sem); @@ -187,7 +188,6 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, up_read(&mm->mmap_sem); return ret; } -EXPORT_SYMBOL(__get_user_pages_unlocked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) @@ -1801,7 +1801,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_fault); -void filemap_map_pages(struct fault_env *fe, +void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { BUG(); diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index be8dc8d1edb9..84d0c7eada2b 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -88,7 +88,7 @@ static int process_vm_rw_single_vec(unsigned long addr, ssize_t rc = 0; unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES / sizeof(struct pages *); - unsigned int flags = FOLL_REMOTE; + unsigned int flags = 0; /* Work out address and page range required */ if (len == 0) @@ -100,15 +100,19 @@ static int process_vm_rw_single_vec(unsigned long addr, while (!rc && nr_pages && iov_iter_count(iter)) { int pages = min(nr_pages, max_pages_per_loop); + int locked = 1; size_t bytes; /* * Get the pages we're interested in. We must - * add FOLL_REMOTE because task/mm might not + * access remotely because task/mm might not * current/current->mm */ - pages = __get_user_pages_unlocked(task, mm, pa, pages, - process_pages, flags); + down_read(&mm->mmap_sem); + pages = get_user_pages_remote(task, mm, pa, pages, flags, + process_pages, NULL, &locked); + if (locked) + up_read(&mm->mmap_sem); if (pages <= 0) return -EFAULT; diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c index 682b73af7766..838ffa78cfda 100644 --- a/security/tomoyo/domain.c +++ b/security/tomoyo/domain.c @@ -881,7 +881,7 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, * the execve(). */ if (get_user_pages_remote(current, bprm->mm, pos, 1, - FOLL_FORCE, &page, NULL) <= 0) + FOLL_FORCE, &page, NULL, NULL) <= 0) return false; #else page = bprm->page[pos / PAGE_SIZE]; diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl index d08e214ec6e7..be93ab02b490 100755 --- a/tools/testing/ktest/ktest.pl +++ b/tools/testing/ktest/ktest.pl @@ -719,14 +719,14 @@ sub set_value { if ($buildonly && $lvalue =~ /^TEST_TYPE(\[.*\])?$/ && $prvalue ne "build") { # Note if a test is something other than build, then we - # will need other manditory options. + # will need other mandatory options. if ($prvalue ne "install") { # for bisect, we need to check BISECT_TYPE if ($prvalue ne "bisect") { $buildonly = 0; } } else { - # install still limits some manditory options. + # install still limits some mandatory options. $buildonly = 2; } } @@ -735,7 +735,7 @@ sub set_value { if ($prvalue ne "install") { $buildonly = 0; } else { - # install still limits some manditory options. + # install still limits some mandatory options. $buildonly = 2; } } @@ -3989,7 +3989,7 @@ sub make_min_config { } } - # Save off all the current mandidory configs + # Save off all the current mandatory configs open (OUT, ">$temp_config") or die "Can't write to $temp_config"; foreach my $config (keys %keep_configs) { diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index efeceb0a222d..3815e940fbea 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -76,16 +76,20 @@ static void async_pf_execute(struct work_struct *work) struct kvm_vcpu *vcpu = apf->vcpu; unsigned long addr = apf->addr; gva_t gva = apf->gva; + int locked = 1; might_sleep(); /* * This work is run asynchromously to the task which owns * mm and might be done in another context, so we must - * use FOLL_REMOTE. + * access remotely. */ - __get_user_pages_unlocked(NULL, mm, addr, 1, NULL, - FOLL_WRITE | FOLL_REMOTE); + down_read(&mm->mmap_sem); + get_user_pages_remote(NULL, mm, addr, 1, FOLL_WRITE, NULL, NULL, + &locked); + if (locked) + up_read(&mm->mmap_sem); kvm_async_page_present_sync(vcpu, apf); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9e81531e85f5..a52ac3440b7b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1415,13 +1415,12 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, npages = get_user_page_nowait(addr, write_fault, page); up_read(¤t->mm->mmap_sem); } else { - unsigned int flags = FOLL_TOUCH | FOLL_HWPOISON; + unsigned int flags = FOLL_HWPOISON; if (write_fault) flags |= FOLL_WRITE; - npages = __get_user_pages_unlocked(current, current->mm, addr, 1, - page, flags); + npages = get_user_pages_unlocked(addr, 1, page, flags); } if (npages != 1) return npages; |