Diffstat (limited to 'mm/memory.c')
-rw-r--r--  mm/memory.c | 237
1 file changed, 200 insertions(+), 37 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index e8bfdf0d9d1d..f703fe8c8346 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -733,6 +733,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte = swp_entry_to_pte(entry);
if (pte_swp_soft_dirty(*src_pte))
pte = pte_swp_mksoft_dirty(pte);
+ if (pte_swp_uffd_wp(*src_pte))
+ pte = pte_swp_mkuffd_wp(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
} else if (is_device_private_entry(entry)) {
@@ -762,6 +764,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
is_cow_mapping(vm_flags)) {
make_device_private_entry_read(&entry);
pte = swp_entry_to_pte(entry);
+ if (pte_swp_uffd_wp(*src_pte))
+ pte = pte_swp_mkuffd_wp(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
}
@@ -785,6 +789,14 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte = pte_mkclean(pte);
pte = pte_mkold(pte);
+ /*
+ * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA
+ * does not have the VM_UFFD_WP, which means that the uffd
+ * fork event is not enabled.
+ */
+ if (!(vm_flags & VM_UFFD_WP))
+ pte = pte_clear_uffd_wp(pte);
+
page = vm_normal_page(vma, addr, pte);
if (page) {
get_page(page);
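
The three hunks above carry the uffd-wp marker across fork(): the swap and device-private paths copy pte_swp_uffd_wp() into the child, and the new comment explains why the bit is dropped when the child VMA has no VM_UFFD_WP (no uffd fork event registered). As a sketch of the userspace setup that makes these paths matter (illustrative only, not taken from this series; everything except the uapi names is an assumption, and error handling is omitted): a monitor enables write-protect faults together with fork events, so the child inherits VM_UFFD_WP and the bits copied above stay meaningful.

/* Sketch only: userspace side that drives the fork paths above.
 * "addr"/"len" are assumed to be an anonymous, page-aligned range. */
#include <fcntl.h>
#include <stddef.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/userfaultfd.h>

static int uffd_wp_setup(void *addr, size_t len)
{
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

	struct uffdio_api api = {
		.api = UFFD_API,
		/* WP faults plus fork events, so children keep VM_UFFD_WP */
		.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP | UFFD_FEATURE_EVENT_FORK,
	};
	ioctl(uffd, UFFDIO_API, &api);

	struct uffdio_register reg = {
		.range = { .start = (unsigned long)addr, .len = len },
		.mode  = UFFDIO_REGISTER_MODE_WP,
	};
	ioctl(uffd, UFFDIO_REGISTER, &reg);

	struct uffdio_writeprotect wp = {
		.range = { .start = (unsigned long)addr, .len = len },
		.mode  = UFFDIO_WRITEPROTECT_MODE_WP,
	};
	ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
	return uffd;
}

With copy_one_pte() preserving the swap-pte uffd-wp bit, pages the monitor write-protected before fork() remain protected in both parent and child.
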
@@ -1407,8 +1419,7 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);
-pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
- spinlock_t **ptl)
+static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -1427,9 +1438,40 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
return NULL;
VM_BUG_ON(pmd_trans_huge(*pmd));
+ return pmd;
+}
+
+pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
+ spinlock_t **ptl)
+{
+ pmd_t *pmd = walk_to_pmd(mm, addr);
+
+ if (!pmd)
+ return NULL;
return pte_alloc_map_lock(mm, pmd, addr, ptl);
}
+static int validate_page_before_insert(struct page *page)
+{
+ if (PageAnon(page) || PageSlab(page) || page_has_type(page))
+ return -EINVAL;
+ flush_dcache_page(page);
+ return 0;
+}
+
+static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
+ unsigned long addr, struct page *page, pgprot_t prot)
+{
+ if (!pte_none(*pte))
+ return -EBUSY;
+ /* Ok, finally just insert the thing.. */
+ get_page(page);
+ inc_mm_counter_fast(mm, mm_counter_file(page));
+ page_add_file_rmap(page, false);
+ set_pte_at(mm, addr, pte, mk_pte(page, prot));
+ return 0;
+}
+
/*
* This is the old fallback for page remapping.
*
@@ -1445,31 +1487,135 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
pte_t *pte;
spinlock_t *ptl;
- retval = -EINVAL;
- if (PageAnon(page) || PageSlab(page) || page_has_type(page))
+ retval = validate_page_before_insert(page);
+ if (retval)
goto out;
retval = -ENOMEM;
- flush_dcache_page(page);
pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
goto out;
- retval = -EBUSY;
- if (!pte_none(*pte))
- goto out_unlock;
-
- /* Ok, finally just insert the thing.. */
- get_page(page);
- inc_mm_counter_fast(mm, mm_counter_file(page));
- page_add_file_rmap(page, false);
- set_pte_at(mm, addr, pte, mk_pte(page, prot));
-
- retval = 0;
-out_unlock:
+ retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
pte_unmap_unlock(pte, ptl);
out:
return retval;
}
+#ifdef pte_index
+static int insert_page_in_batch_locked(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, struct page *page, pgprot_t prot)
+{
+ int err;
+
+ if (!page_count(page))
+ return -EINVAL;
+ err = validate_page_before_insert(page);
+ return err ? err : insert_page_into_pte_locked(
+ mm, pte_offset_map(pmd, addr), addr, page, prot);
+}
+
+/* insert_pages() amortizes the cost of spinlock operations
+ * when inserting pages in a loop. Arch *must* define pte_index.
+ */
+static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
+ struct page **pages, unsigned long *num, pgprot_t prot)
+{
+ pmd_t *pmd = NULL;
+ spinlock_t *pte_lock = NULL;
+ struct mm_struct *const mm = vma->vm_mm;
+ unsigned long curr_page_idx = 0;
+ unsigned long remaining_pages_total = *num;
+ unsigned long pages_to_write_in_pmd;
+ int ret;
+more:
+ ret = -EFAULT;
+ pmd = walk_to_pmd(mm, addr);
+ if (!pmd)
+ goto out;
+
+ pages_to_write_in_pmd = min_t(unsigned long,
+ remaining_pages_total, PTRS_PER_PTE - pte_index(addr));
+
+ /* Allocate the PTE if necessary; takes PMD lock once only. */
+ ret = -ENOMEM;
+ if (pte_alloc(mm, pmd))
+ goto out;
+ pte_lock = pte_lockptr(mm, pmd);
+
+ while (pages_to_write_in_pmd) {
+ int pte_idx = 0;
+ const int batch_size = min_t(int, pages_to_write_in_pmd, 8);
+
+ spin_lock(pte_lock);
+ for (; pte_idx < batch_size; ++pte_idx) {
+ int err = insert_page_in_batch_locked(mm, pmd,
+ addr, pages[curr_page_idx], prot);
+ if (unlikely(err)) {
+ spin_unlock(pte_lock);
+ ret = err;
+ remaining_pages_total -= pte_idx;
+ goto out;
+ }
+ addr += PAGE_SIZE;
+ ++curr_page_idx;
+ }
+ spin_unlock(pte_lock);
+ pages_to_write_in_pmd -= batch_size;
+ remaining_pages_total -= batch_size;
+ }
+ if (remaining_pages_total)
+ goto more;
+ ret = 0;
+out:
+ *num = remaining_pages_total;
+ return ret;
+}
+#endif /* ifdef pte_index */
+
+/**
+ * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
+ * @vma: user vma to map to
+ * @addr: target start user address of these pages
+ * @pages: source kernel pages
+ * @num: in: number of pages to map. out: number of pages that were *not*
+ * mapped. (0 means all pages were successfully mapped).
+ *
+ * Preferred over vm_insert_page() when inserting multiple pages.
+ *
+ * In case of error, we may have mapped a subset of the provided
+ * pages. It is the caller's responsibility to account for this case.
+ *
+ * The same restrictions apply as in vm_insert_page().
+ */
+int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
+ struct page **pages, unsigned long *num)
+{
+#ifdef pte_index
+ const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1;
+
+ if (addr < vma->vm_start || end_addr >= vma->vm_end)
+ return -EFAULT;
+ if (!(vma->vm_flags & VM_MIXEDMAP)) {
+ BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
+ BUG_ON(vma->vm_flags & VM_PFNMAP);
+ vma->vm_flags |= VM_MIXEDMAP;
+ }
+ /* Defer page refcount checking till we're about to map that page. */
+ return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
+#else
+ unsigned long idx = 0, pgcount = *num;
+ int err;
+
+ for (; idx < pgcount; ++idx) {
+ err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]);
+ if (err)
+ break;
+ }
+ *num = pgcount - idx;
+ return err;
+#endif /* ifdef pte_index */
+}
+EXPORT_SYMBOL(vm_insert_pages);
+
/**
* vm_insert_page - insert single page into user vma
* @vma: user vma to map to
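
The new vm_insert_pages() is aimed at callers that previously looped over vm_insert_page(); it takes the PTE lock once per batch rather than once per page. Below is a hedged sketch of a hypothetical driver ->mmap handler using it; the "mydrv_*" names and buffer layout are assumptions, not part of this patch.

/* Hypothetical driver mmap: map a preallocated page array in one call. */
static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct mydrv_buf *buf = file->private_data;	/* assumed driver state */
	unsigned long num = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
	int err;

	if (num > buf->nr_pages)
		return -EINVAL;

	/* On return, num holds the count of pages *not* mapped (0 on success). */
	err = vm_insert_pages(vma, vma->vm_start, buf->pages, &num);
	if (err)
		pr_warn("mydrv: %lu pages left unmapped (err %d)\n", num, err);
	return err;
}

Per the kernel-doc above, a non-zero return may still leave a subset of the pages mapped; the caller is expected to account for that.
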
@@ -1939,8 +2085,8 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
* remap_pfn_range - remap kernel memory to userspace
* @vma: user vma to map to
* @addr: target user address to start at
- * @pfn: physical address of kernel memory
- * @size: size of map area
+ * @pfn: page frame number of kernel physical memory address
+ * @size: size of mapping area
* @prot: page protection flags for this mapping
*
* Note: this is only safe if the mm semaphore is held when called.
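
For reference, the two kernel-doc fixes in this area (remap_pfn_range() here and vm_iomap_memory() in the next hunk) describe the usual driver mapping helpers. A minimal, illustrative ->mmap pair, assuming "mydrv_phys_base"/"mydrv_size" device resources that are not part of this patch:

/* Illustrative only: hand a PFN range to userspace from ->mmap. */
static int mydrv_mmap_pfn(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;
	unsigned long pfn = (mydrv_phys_base >> PAGE_SHIFT) + vma->vm_pgoff;

	return remap_pfn_range(vma, vma->vm_start, pfn, size,
			       vma->vm_page_prot);
}

/* Or let vm_iomap_memory() validate the vma against the region itself. */
static int mydrv_mmap_io(struct file *file, struct vm_area_struct *vma)
{
	return vm_iomap_memory(vma, mydrv_phys_base, mydrv_size);
}
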
@@ -2009,7 +2155,7 @@ EXPORT_SYMBOL(remap_pfn_range);
/**
* vm_iomap_memory - remap memory to userspace
* @vma: user vma to map to
- * @start: start of area
+ * @start: start of the physical memory to be mapped
* @len: size of area
*
* This is a simplified io_remap_pfn_range() for common driver use. The
@@ -2752,6 +2898,11 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
+ if (userfaultfd_pte_wp(vma, *vmf->pte)) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return handle_userfault(vmf, VM_UFFD_WP);
+ }
+
vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
if (!vmf->page) {
/*
@@ -3085,6 +3236,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
flush_icache_page(vma, page);
if (pte_swp_soft_dirty(vmf->orig_pte))
pte = pte_mksoft_dirty(pte);
+ if (pte_swp_uffd_wp(vmf->orig_pte)) {
+ pte = pte_mkuffd_wp(pte);
+ pte = pte_wrprotect(pte);
+ }
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
vmf->orig_pte = pte;
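
Together, the do_wp_page() and do_swap_page() hunks deliver write-protect faults to userfaultfd and keep the uffd-wp bit across swap-in; the monitor resolves such a fault with UFFDIO_WRITEPROTECT. A sketch of that resolution step, continuing the earlier userspace sketch (illustrative only, error handling omitted):

/* Read one event from the uffd and drop write protection on the
 * faulting page so the stalled thread can continue. */
static void uffd_wp_handle_one(int uffd, long page_size)
{
	struct uffd_msg msg;

	if (read(uffd, &msg, sizeof(msg)) != sizeof(msg))
		return;
	if (msg.event != UFFD_EVENT_PAGEFAULT ||
	    !(msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP))
		return;

	struct uffdio_writeprotect wp = {
		.range = {
			.start = msg.arg.pagefault.address & ~((__u64)page_size - 1),
			.len   = page_size,
		},
		.mode = 0,	/* 0 = remove the write protection */
	};
	ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
}
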
@@ -3373,7 +3528,7 @@ map_pte:
return 0;
}
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void deposit_prealloc_pte(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
@@ -3475,8 +3630,7 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
pte_t entry;
vm_fault_t ret;
- if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
- IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
+ if (pmd_none(*vmf->pmd) && PageTransCompound(page)) {
/* THP on COW? */
VM_BUG_ON_PAGE(memcg, page);
@@ -3949,31 +4103,40 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
/* `inline' is required to avoid gcc 4.1.2 build error */
static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
{
- if (vma_is_anonymous(vmf->vma))
+ if (vma_is_anonymous(vmf->vma)) {
+ if (userfaultfd_huge_pmd_wp(vmf->vma, orig_pmd))
+ return handle_userfault(vmf, VM_UFFD_WP);
return do_huge_pmd_wp_page(vmf, orig_pmd);
- if (vmf->vma->vm_ops->huge_fault)
- return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
+ }
+ if (vmf->vma->vm_ops->huge_fault) {
+ vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
+
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+ }
- /* COW handled on pte level: split pmd */
- VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
+ /* COW or write-notify handled on pte level: split pmd. */
__split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
return VM_FAULT_FALLBACK;
}
-static inline bool vma_is_accessible(struct vm_area_struct *vma)
-{
- return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
-}
-
static vm_fault_t create_huge_pud(struct vm_fault *vmf)
{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
/* No support for anonymous transparent PUD pages yet */
if (vma_is_anonymous(vmf->vma))
- return VM_FAULT_FALLBACK;
- if (vmf->vma->vm_ops->huge_fault)
- return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+ goto split;
+ if (vmf->vma->vm_ops->huge_fault) {
+ vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+ }
+split:
+ /* COW or write-notify not handled on PUD level: split pud.*/
+ __split_huge_pud(vmf->vma, vmf->pud, vmf->address);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
return VM_FAULT_FALLBACK;
}
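
A hypothetical vm_ops implementation exercising the new fallback above: returning VM_FAULT_FALLBACK from ->huge_fault now ends in __split_huge_pmd()/__split_huge_pud() and a PTE-level retry, including for shared (write-notify) mappings, instead of tripping the old VM_BUG_ON. All "mydrv_*" names are assumptions, not from this patch:

static vm_fault_t mydrv_huge_fault(struct vm_fault *vmf,
				   enum page_entry_size pe_size)
{
	if (pe_size != PE_SIZE_PMD)
		return VM_FAULT_FALLBACK;	/* no PUD-sized support */
	if (!mydrv_can_map_pmd(vmf))		/* assumed helper */
		return VM_FAULT_FALLBACK;	/* core splits and retries */
	return mydrv_insert_pmd(vmf);		/* assumed helper */
}

static const struct vm_operations_struct mydrv_vm_ops = {
	.fault		= mydrv_fault,		/* assumed helper */
	.huge_fault	= mydrv_huge_fault,
	.page_mkwrite	= mydrv_page_mkwrite,	/* assumed helper */
};
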