diff options
Diffstat (limited to 'mm')
58 files changed, 6397 insertions, 1896 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 48b1af447fa7..9c4bdddd80c2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -262,6 +262,9 @@ config MIGRATION config ARCH_ENABLE_HUGEPAGE_MIGRATION bool +config ARCH_ENABLE_THP_MIGRATION + bool + config PHYS_ADDR_T_64BIT def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT @@ -673,11 +676,12 @@ config ARCH_HAS_ZONE_DEVICE bool config ZONE_DEVICE - bool "Device memory (pmem, etc...) hotplug support" + bool "Device memory (pmem, HMM, etc...) hotplug support" depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE depends on SPARSEMEM_VMEMMAP depends on ARCH_HAS_ZONE_DEVICE + select RADIX_TREE_MULTIORDER help Device memory hotplug support allows for establishing pmem, @@ -688,6 +692,55 @@ config ZONE_DEVICE If FS_DAX is enabled, then say Y. +config ARCH_HAS_HMM + bool + default y + depends on (X86_64 || PPC64) + depends on ZONE_DEVICE + depends on MMU && 64BIT + depends on MEMORY_HOTPLUG + depends on MEMORY_HOTREMOVE + depends on SPARSEMEM_VMEMMAP + +config MIGRATE_VMA_HELPER + bool + +config HMM + bool + select MIGRATE_VMA_HELPER + +config HMM_MIRROR + bool "HMM mirror CPU page table into a device page table" + depends on ARCH_HAS_HMM + select MMU_NOTIFIER + select HMM + help + Select HMM_MIRROR if you want to mirror range of the CPU page table of a + process into a device page table. Here, mirror means "keep synchronized". + Prerequisites: the device must provide the ability to write-protect its + page tables (at PAGE_SIZE granularity), and must be able to recover from + the resulting potential page faults. + +config DEVICE_PRIVATE + bool "Unaddressable device memory (GPU memory, ...)" + depends on ARCH_HAS_HMM + select HMM + + help + Allows creation of struct pages to represent unaddressable device + memory; i.e., memory that is only accessible from the device (or + group of devices). You likely also want to select HMM_MIRROR. + +config DEVICE_PUBLIC + bool "Addressable device memory (like GPU memory)" + depends on ARCH_HAS_HMM + select HMM + + help + Allows creation of struct pages to represent addressable device + memory; i.e., memory that is accessible from both the device and + the CPU + config FRAME_VECTOR bool diff --git a/mm/Makefile b/mm/Makefile index 411bd24d4a7c..e3ac3aeb533b 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -104,3 +104,4 @@ obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o +obj-$(CONFIG_HMM) += hmm.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f028a9a472fd..e19606bb41a0 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -569,8 +569,10 @@ static int cgwb_create(struct backing_dev_info *bdi, /* need to create a new one */ wb = kmalloc(sizeof(*wb), gfp); - if (!wb) - return -ENOMEM; + if (!wb) { + ret = -ENOMEM; + goto out_put; + } ret = wb_init(wb, bdi, blkcg_css->id, gfp); if (ret) diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index b06d9fe23a28..68d28924ba79 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -139,6 +139,14 @@ int balloon_page_migrate(struct address_space *mapping, { struct balloon_dev_info *balloon = balloon_page_device(page); + /* + * We can not easily support the no copy case here so ignore it as it + * is unlikely to be use with ballon pages. See include/linux/hmm.h for + * user of the MIGRATE_SYNC_NO_COPY mode. + */ + if (mode == MIGRATE_SYNC_NO_COPY) + return -EINVAL; + VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index 6d5717bd7197..b1dd4a948fc0 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -30,6 +30,13 @@ early_param("early_ioremap_debug", early_ioremap_debug_setup); static int after_paging_init __initdata; +pgprot_t __init __weak early_memremap_pgprot_adjust(resource_size_t phys_addr, + unsigned long size, + pgprot_t prot) +{ + return prot; +} + void __init __weak early_ioremap_shutdown(void) { } @@ -215,14 +222,29 @@ early_ioremap(resource_size_t phys_addr, unsigned long size) void __init * early_memremap(resource_size_t phys_addr, unsigned long size) { - return (__force void *)__early_ioremap(phys_addr, size, - FIXMAP_PAGE_NORMAL); + pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size, + FIXMAP_PAGE_NORMAL); + + return (__force void *)__early_ioremap(phys_addr, size, prot); } #ifdef FIXMAP_PAGE_RO void __init * early_memremap_ro(resource_size_t phys_addr, unsigned long size) { - return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO); + pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size, + FIXMAP_PAGE_RO); + + return (__force void *)__early_ioremap(phys_addr, size, prot); +} +#endif + +#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT +void __init * +early_memremap_prot(resource_size_t phys_addr, unsigned long size, + unsigned long prot_val) +{ + return (__force void *)__early_ioremap(phys_addr, size, + __pgprot(prot_val)); } #endif diff --git a/mm/fadvise.c b/mm/fadvise.c index a43013112581..702f239cd6db 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -52,7 +52,9 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) goto out; } - if (IS_DAX(inode)) { + bdi = inode_to_bdi(mapping->host); + + if (IS_DAX(inode) || (bdi == &noop_backing_dev_info)) { switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_RANDOM: @@ -75,8 +77,6 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) else endbyte--; /* inclusive */ - bdi = inode_to_bdi(mapping->host); - switch (advice) { case POSIX_FADV_NORMAL: f.file->f_ra.ra_pages = bdi->ra_pages; diff --git a/mm/filemap.c b/mm/filemap.c index 92d4e0a6c012..870971e20967 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -130,17 +130,8 @@ static int page_cache_tree_insert(struct address_space *mapping, return -EEXIST; mapping->nrexceptional--; - if (!dax_mapping(mapping)) { - if (shadowp) - *shadowp = p; - } else { - /* DAX can replace empty locked entry with a hole */ - WARN_ON_ONCE(p != - dax_radix_locked_entry(0, RADIX_DAX_EMPTY)); - /* Wakeup waiters for exceptional entry lock */ - dax_wake_mapping_entry_waiter(mapping, page->index, p, - true); - } + if (shadowp) + *shadowp = p; } __radix_tree_replace(&mapping->page_tree, node, slot, page, workingset_update_node, mapping); @@ -402,8 +393,7 @@ bool filemap_range_has_page(struct address_space *mapping, { pgoff_t index = start_byte >> PAGE_SHIFT; pgoff_t end = end_byte >> PAGE_SHIFT; - struct pagevec pvec; - bool ret; + struct page *page; if (end_byte < start_byte) return false; @@ -411,12 +401,10 @@ bool filemap_range_has_page(struct address_space *mapping, if (mapping->nrpages == 0) return false; - pagevec_init(&pvec, 0); - if (!pagevec_lookup(&pvec, mapping, index, 1)) + if (!find_get_pages_range(mapping, &index, end, 1, &page)) return false; - ret = (pvec.pages[0]->index <= end); - pagevec_release(&pvec); - return ret; + put_page(page); + return true; } EXPORT_SYMBOL(filemap_range_has_page); @@ -476,6 +464,29 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, EXPORT_SYMBOL(filemap_fdatawait_range); /** + * file_fdatawait_range - wait for writeback to complete + * @file: file pointing to address space structure to wait for + * @start_byte: offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) + * + * Walk the list of under-writeback pages of the address space that file + * refers to, in the given range and wait for all of them. Check error + * status of the address space vs. the file->f_wb_err cursor and return it. + * + * Since the error status of the file is advanced by this function, + * callers are responsible for checking the return value and handling and/or + * reporting the error. + */ +int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte) +{ + struct address_space *mapping = file->f_mapping; + + __filemap_fdatawait_range(mapping, start_byte, end_byte); + return file_check_and_advance_wb_err(file); +} +EXPORT_SYMBOL(file_fdatawait_range); + +/** * filemap_fdatawait_keep_errors - wait for writeback without clearing errors * @mapping: address space structure to wait for * @@ -489,45 +500,22 @@ EXPORT_SYMBOL(filemap_fdatawait_range); */ int filemap_fdatawait_keep_errors(struct address_space *mapping) { - loff_t i_size = i_size_read(mapping->host); - - if (i_size == 0) - return 0; - - __filemap_fdatawait_range(mapping, 0, i_size - 1); + __filemap_fdatawait_range(mapping, 0, LLONG_MAX); return filemap_check_and_keep_errors(mapping); } EXPORT_SYMBOL(filemap_fdatawait_keep_errors); -/** - * filemap_fdatawait - wait for all under-writeback pages to complete - * @mapping: address space structure to wait for - * - * Walk the list of under-writeback pages of the given address space - * and wait for all of them. Check error status of the address space - * and return it. - * - * Since the error status of the address space is cleared by this function, - * callers are responsible for checking the return value and handling and/or - * reporting the error. - */ -int filemap_fdatawait(struct address_space *mapping) +static bool mapping_needs_writeback(struct address_space *mapping) { - loff_t i_size = i_size_read(mapping->host); - - if (i_size == 0) - return 0; - - return filemap_fdatawait_range(mapping, 0, i_size - 1); + return (!dax_mapping(mapping) && mapping->nrpages) || + (dax_mapping(mapping) && mapping->nrexceptional); } -EXPORT_SYMBOL(filemap_fdatawait); int filemap_write_and_wait(struct address_space *mapping) { int err = 0; - if ((!dax_mapping(mapping) && mapping->nrpages) || - (dax_mapping(mapping) && mapping->nrexceptional)) { + if (mapping_needs_writeback(mapping)) { err = filemap_fdatawrite(mapping); /* * Even if the above returned error, the pages may be @@ -566,8 +554,7 @@ int filemap_write_and_wait_range(struct address_space *mapping, { int err = 0; - if ((!dax_mapping(mapping) && mapping->nrpages) || - (dax_mapping(mapping) && mapping->nrexceptional)) { + if (mapping_needs_writeback(mapping)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); /* See comment of filemap_write_and_wait() */ @@ -589,7 +576,7 @@ EXPORT_SYMBOL(filemap_write_and_wait_range); void __filemap_set_wb_err(struct address_space *mapping, int err) { - errseq_t eseq = __errseq_set(&mapping->wb_err, err); + errseq_t eseq = errseq_set(&mapping->wb_err, err); trace_filemap_set_wb_err(mapping, eseq); } @@ -656,8 +643,7 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) int err = 0, err2; struct address_space *mapping = file->f_mapping; - if ((!dax_mapping(mapping) && mapping->nrpages) || - (dax_mapping(mapping) && mapping->nrexceptional)) { + if (mapping_needs_writeback(mapping)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); /* See comment of filemap_write_and_wait() */ @@ -923,13 +909,33 @@ static void wake_up_page_bit(struct page *page, int bit_nr) wait_queue_head_t *q = page_waitqueue(page); struct wait_page_key key; unsigned long flags; + wait_queue_entry_t bookmark; key.page = page; key.bit_nr = bit_nr; key.page_match = 0; + bookmark.flags = 0; + bookmark.private = NULL; + bookmark.func = NULL; + INIT_LIST_HEAD(&bookmark.entry); + spin_lock_irqsave(&q->lock, flags); - __wake_up_locked_key(q, TASK_NORMAL, &key); + __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark); + + while (bookmark.flags & WQ_FLAG_BOOKMARK) { + /* + * Take a breather from holding the lock, + * allow pages that finish wake up asynchronously + * to acquire the lock and remove themselves + * from wait queue + */ + spin_unlock_irqrestore(&q->lock, flags); + cpu_relax(); + spin_lock_irqsave(&q->lock, flags); + __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark); + } + /* * It is possible for other pages to have collided on the waitqueue * hash, so in that case check for a page match. That prevents a long- @@ -1041,7 +1047,7 @@ void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter) unsigned long flags; spin_lock_irqsave(&q->lock, flags); - __add_wait_queue(q, waiter); + __add_wait_queue_entry_tail(q, waiter); SetPageWaiters(page); spin_unlock_irqrestore(&q->lock, flags); } @@ -1566,23 +1572,29 @@ export: } /** - * find_get_pages - gang pagecache lookup + * find_get_pages_range - gang pagecache lookup * @mapping: The address_space to search * @start: The starting page index + * @end: The final page index (inclusive) * @nr_pages: The maximum number of pages * @pages: Where the resulting pages are placed * - * find_get_pages() will search for and return a group of up to - * @nr_pages pages in the mapping. The pages are placed at @pages. - * find_get_pages() takes a reference against the returned pages. + * find_get_pages_range() will search for and return a group of up to @nr_pages + * pages in the mapping starting at index @start and up to index @end + * (inclusive). The pages are placed at @pages. find_get_pages_range() takes + * a reference against the returned pages. * * The search returns a group of mapping-contiguous pages with ascending * indexes. There may be holes in the indices due to not-present pages. + * We also update @start to index the next page for the traversal. * - * find_get_pages() returns the number of pages which were found. + * find_get_pages_range() returns the number of pages which were found. If this + * number is smaller than @nr_pages, the end of specified range has been + * reached. */ -unsigned find_get_pages(struct address_space *mapping, pgoff_t start, - unsigned int nr_pages, struct page **pages) +unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, + pgoff_t end, unsigned int nr_pages, + struct page **pages) { struct radix_tree_iter iter; void **slot; @@ -1592,8 +1604,11 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, *start) { struct page *head, *page; + + if (iter.index > end) + break; repeat: page = radix_tree_deref_slot(slot); if (unlikely(!page)) @@ -1629,11 +1644,25 @@ repeat: } pages[ret] = page; - if (++ret == nr_pages) - break; + if (++ret == nr_pages) { + *start = pages[ret - 1]->index + 1; + goto out; + } } + /* + * We come here when there is no page beyond @end. We take care to not + * overflow the index @start as it confuses some of the callers. This + * breaks the iteration when there is page at index -1 but that is + * already broken anyway. + */ + if (end == (pgoff_t)-1) + *start = (pgoff_t)-1; + else + *start = end + 1; +out: rcu_read_unlock(); + return ret; } @@ -234,6 +234,16 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, return page; return no_page_table(vma, flags); } +retry: + if (!pmd_present(*pmd)) { + if (likely(!(flags & FOLL_MIGRATION))) + return no_page_table(vma, flags); + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(*pmd)); + if (is_pmd_migration_entry(*pmd)) + pmd_migration_entry_wait(mm, pmd); + goto retry; + } if (pmd_devmap(*pmd)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags); @@ -247,7 +257,15 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) return no_page_table(vma, flags); +retry_locked: ptl = pmd_lock(mm, pmd); + if (unlikely(!pmd_present(*pmd))) { + spin_unlock(ptl); + if (likely(!(flags & FOLL_MIGRATION))) + return no_page_table(vma, flags); + pmd_migration_entry_wait(mm, pmd); + goto retry_locked; + } if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl); return follow_page_pte(vma, address, pmd, flags); @@ -424,7 +442,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, pud = pud_offset(p4d, address); BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) + if (!pmd_present(*pmd)) return -EFAULT; VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map(pmd, address); @@ -438,6 +456,13 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte))) goto unmap; *page = pte_page(*pte); + + /* + * This should never happen (a device public page in the gate + * area). + */ + if (is_device_public_page(*page)) + goto unmap; } get_page(*page); out: @@ -1352,7 +1377,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, } #endif /* __HAVE_ARCH_PTE_SPECIAL */ -#ifdef __HAVE_ARCH_PTE_DEVMAP +#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __gup_device_huge(unsigned long pfn, unsigned long addr, unsigned long end, struct page **pages, int *nr) { @@ -1534,7 +1559,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, pmd_t pmd = READ_ONCE(*pmdp); next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) + if (!pmd_present(pmd)) return 0; if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) { diff --git a/mm/hmm.c b/mm/hmm.c new file mode 100644 index 000000000000..a88a847bccba --- /dev/null +++ b/mm/hmm.c @@ -0,0 +1,1257 @@ +/* + * Copyright 2013 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: Jérôme Glisse <jglisse@redhat.com> + */ +/* + * Refer to include/linux/hmm.h for information about heterogeneous memory + * management or HMM for short. + */ +#include <linux/mm.h> +#include <linux/hmm.h> +#include <linux/init.h> +#include <linux/rmap.h> +#include <linux/swap.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/mmzone.h> +#include <linux/pagemap.h> +#include <linux/swapops.h> +#include <linux/hugetlb.h> +#include <linux/memremap.h> +#include <linux/jump_label.h> +#include <linux/mmu_notifier.h> +#include <linux/memory_hotplug.h> + +#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT) + +#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC) +/* + * Device private memory see HMM (Documentation/vm/hmm.txt) or hmm.h + */ +DEFINE_STATIC_KEY_FALSE(device_private_key); +EXPORT_SYMBOL(device_private_key); +#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ + + +#if IS_ENABLED(CONFIG_HMM_MIRROR) +static const struct mmu_notifier_ops hmm_mmu_notifier_ops; + +/* + * struct hmm - HMM per mm struct + * + * @mm: mm struct this HMM struct is bound to + * @lock: lock protecting ranges list + * @sequence: we track updates to the CPU page table with a sequence number + * @ranges: list of range being snapshotted + * @mirrors: list of mirrors for this mm + * @mmu_notifier: mmu notifier to track updates to CPU page table + * @mirrors_sem: read/write semaphore protecting the mirrors list + */ +struct hmm { + struct mm_struct *mm; + spinlock_t lock; + atomic_t sequence; + struct list_head ranges; + struct list_head mirrors; + struct mmu_notifier mmu_notifier; + struct rw_semaphore mirrors_sem; +}; + +/* + * hmm_register - register HMM against an mm (HMM internal) + * + * @mm: mm struct to attach to + * + * This is not intended to be used directly by device drivers. It allocates an + * HMM struct if mm does not have one, and initializes it. + */ +static struct hmm *hmm_register(struct mm_struct *mm) +{ + struct hmm *hmm = READ_ONCE(mm->hmm); + bool cleanup = false; + + /* + * The hmm struct can only be freed once the mm_struct goes away, + * hence we should always have pre-allocated an new hmm struct + * above. + */ + if (hmm) + return hmm; + + hmm = kmalloc(sizeof(*hmm), GFP_KERNEL); + if (!hmm) + return NULL; + INIT_LIST_HEAD(&hmm->mirrors); + init_rwsem(&hmm->mirrors_sem); + atomic_set(&hmm->sequence, 0); + hmm->mmu_notifier.ops = NULL; + INIT_LIST_HEAD(&hmm->ranges); + spin_lock_init(&hmm->lock); + hmm->mm = mm; + + /* + * We should only get here if hold the mmap_sem in write mode ie on + * registration of first mirror through hmm_mirror_register() + */ + hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops; + if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) { + kfree(hmm); + return NULL; + } + + spin_lock(&mm->page_table_lock); + if (!mm->hmm) + mm->hmm = hmm; + else + cleanup = true; + spin_unlock(&mm->page_table_lock); + + if (cleanup) { + mmu_notifier_unregister(&hmm->mmu_notifier, mm); + kfree(hmm); + } + + return mm->hmm; +} + +void hmm_mm_destroy(struct mm_struct *mm) +{ + kfree(mm->hmm); +} + +static void hmm_invalidate_range(struct hmm *hmm, + enum hmm_update_type action, + unsigned long start, + unsigned long end) +{ + struct hmm_mirror *mirror; + struct hmm_range *range; + + spin_lock(&hmm->lock); + list_for_each_entry(range, &hmm->ranges, list) { + unsigned long addr, idx, npages; + + if (end < range->start || start >= range->end) + continue; + + range->valid = false; + addr = max(start, range->start); + idx = (addr - range->start) >> PAGE_SHIFT; + npages = (min(range->end, end) - addr) >> PAGE_SHIFT; + memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages); + } + spin_unlock(&hmm->lock); + + down_read(&hmm->mirrors_sem); + list_for_each_entry(mirror, &hmm->mirrors, list) + mirror->ops->sync_cpu_device_pagetables(mirror, action, + start, end); + up_read(&hmm->mirrors_sem); +} + +static void hmm_invalidate_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct hmm *hmm = mm->hmm; + + VM_BUG_ON(!hmm); + + atomic_inc(&hmm->sequence); +} + +static void hmm_invalidate_range_end(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct hmm *hmm = mm->hmm; + + VM_BUG_ON(!hmm); + + hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end); +} + +static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { + .invalidate_range_start = hmm_invalidate_range_start, + .invalidate_range_end = hmm_invalidate_range_end, +}; + +/* + * hmm_mirror_register() - register a mirror against an mm + * + * @mirror: new mirror struct to register + * @mm: mm to register against + * + * To start mirroring a process address space, the device driver must register + * an HMM mirror struct. + * + * THE mm->mmap_sem MUST BE HELD IN WRITE MODE ! + */ +int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) +{ + /* Sanity check */ + if (!mm || !mirror || !mirror->ops) + return -EINVAL; + + mirror->hmm = hmm_register(mm); + if (!mirror->hmm) + return -ENOMEM; + + down_write(&mirror->hmm->mirrors_sem); + list_add(&mirror->list, &mirror->hmm->mirrors); + up_write(&mirror->hmm->mirrors_sem); + + return 0; +} +EXPORT_SYMBOL(hmm_mirror_register); + +/* + * hmm_mirror_unregister() - unregister a mirror + * + * @mirror: new mirror struct to register + * + * Stop mirroring a process address space, and cleanup. + */ +void hmm_mirror_unregister(struct hmm_mirror *mirror) +{ + struct hmm *hmm = mirror->hmm; + + down_write(&hmm->mirrors_sem); + list_del(&mirror->list); + up_write(&hmm->mirrors_sem); +} +EXPORT_SYMBOL(hmm_mirror_unregister); + +struct hmm_vma_walk { + struct hmm_range *range; + unsigned long last; + bool fault; + bool block; + bool write; +}; + +static int hmm_vma_do_fault(struct mm_walk *walk, + unsigned long addr, + hmm_pfn_t *pfn) +{ + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE; + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct vm_area_struct *vma = walk->vma; + int r; + + flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY; + flags |= hmm_vma_walk->write ? FAULT_FLAG_WRITE : 0; + r = handle_mm_fault(vma, addr, flags); + if (r & VM_FAULT_RETRY) + return -EBUSY; + if (r & VM_FAULT_ERROR) { + *pfn = HMM_PFN_ERROR; + return -EFAULT; + } + + return -EAGAIN; +} + +static void hmm_pfns_special(hmm_pfn_t *pfns, + unsigned long addr, + unsigned long end) +{ + for (; addr < end; addr += PAGE_SIZE, pfns++) + *pfns = HMM_PFN_SPECIAL; +} + +static int hmm_pfns_bad(unsigned long addr, + unsigned long end, + struct mm_walk *walk) +{ + struct hmm_range *range = walk->private; + hmm_pfn_t *pfns = range->pfns; + unsigned long i; + + i = (addr - range->start) >> PAGE_SHIFT; + for (; addr < end; addr += PAGE_SIZE, i++) + pfns[i] = HMM_PFN_ERROR; + + return 0; +} + +static void hmm_pfns_clear(hmm_pfn_t *pfns, + unsigned long addr, + unsigned long end) +{ + for (; addr < end; addr += PAGE_SIZE, pfns++) + *pfns = 0; +} + +static int hmm_vma_walk_hole(unsigned long addr, + unsigned long end, + struct mm_walk *walk) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + hmm_pfn_t *pfns = range->pfns; + unsigned long i; + + hmm_vma_walk->last = addr; + i = (addr - range->start) >> PAGE_SHIFT; + for (; addr < end; addr += PAGE_SIZE, i++) { + pfns[i] = HMM_PFN_EMPTY; + if (hmm_vma_walk->fault) { + int ret; + + ret = hmm_vma_do_fault(walk, addr, &pfns[i]); + if (ret != -EAGAIN) + return ret; + } + } + + return hmm_vma_walk->fault ? -EAGAIN : 0; +} + +static int hmm_vma_walk_clear(unsigned long addr, + unsigned long end, + struct mm_walk *walk) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + hmm_pfn_t *pfns = range->pfns; + unsigned long i; + + hmm_vma_walk->last = addr; + i = (addr - range->start) >> PAGE_SHIFT; + for (; addr < end; addr += PAGE_SIZE, i++) { + pfns[i] = 0; + if (hmm_vma_walk->fault) { + int ret; + + ret = hmm_vma_do_fault(walk, addr, &pfns[i]); + if (ret != -EAGAIN) + return ret; + } + } + + return hmm_vma_walk->fault ? -EAGAIN : 0; +} + +static int hmm_vma_walk_pmd(pmd_t *pmdp, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + struct vm_area_struct *vma = walk->vma; + hmm_pfn_t *pfns = range->pfns; + unsigned long addr = start, i; + bool write_fault; + hmm_pfn_t flag; + pte_t *ptep; + + i = (addr - range->start) >> PAGE_SHIFT; + flag = vma->vm_flags & VM_READ ? HMM_PFN_READ : 0; + write_fault = hmm_vma_walk->fault & hmm_vma_walk->write; + +again: + if (pmd_none(*pmdp)) + return hmm_vma_walk_hole(start, end, walk); + + if (pmd_huge(*pmdp) && vma->vm_flags & VM_HUGETLB) + return hmm_pfns_bad(start, end, walk); + + if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) { + unsigned long pfn; + pmd_t pmd; + + /* + * No need to take pmd_lock here, even if some other threads + * is splitting the huge pmd we will get that event through + * mmu_notifier callback. + * + * So just read pmd value and check again its a transparent + * huge or device mapping one and compute corresponding pfn + * values. + */ + pmd = pmd_read_atomic(pmdp); + barrier(); + if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) + goto again; + if (pmd_protnone(pmd)) + return hmm_vma_walk_clear(start, end, walk); + + if (write_fault && !pmd_write(pmd)) + return hmm_vma_walk_clear(start, end, walk); + + pfn = pmd_pfn(pmd) + pte_index(addr); + flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0; + for (; addr < end; addr += PAGE_SIZE, i++, pfn++) + pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag; + return 0; + } + + if (pmd_bad(*pmdp)) + return hmm_pfns_bad(start, end, walk); + + ptep = pte_offset_map(pmdp, addr); + for (; addr < end; addr += PAGE_SIZE, ptep++, i++) { + pte_t pte = *ptep; + + pfns[i] = 0; + + if (pte_none(pte)) { + pfns[i] = HMM_PFN_EMPTY; + if (hmm_vma_walk->fault) + goto fault; + continue; + } + + if (!pte_present(pte)) { + swp_entry_t entry; + + if (!non_swap_entry(entry)) { + if (hmm_vma_walk->fault) + goto fault; + continue; + } + + entry = pte_to_swp_entry(pte); + + /* + * This is a special swap entry, ignore migration, use + * device and report anything else as error. + */ + if (is_device_private_entry(entry)) { + pfns[i] = hmm_pfn_t_from_pfn(swp_offset(entry)); + if (is_write_device_private_entry(entry)) { + pfns[i] |= HMM_PFN_WRITE; + } else if (write_fault) + goto fault; + pfns[i] |= HMM_PFN_DEVICE_UNADDRESSABLE; + pfns[i] |= flag; + } else if (is_migration_entry(entry)) { + if (hmm_vma_walk->fault) { + pte_unmap(ptep); + hmm_vma_walk->last = addr; + migration_entry_wait(vma->vm_mm, + pmdp, addr); + return -EAGAIN; + } + continue; + } else { + /* Report error for everything else */ + pfns[i] = HMM_PFN_ERROR; + } + continue; + } + + if (write_fault && !pte_write(pte)) + goto fault; + + pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag; + pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0; + continue; + +fault: + pte_unmap(ptep); + /* Fault all pages in range */ + return hmm_vma_walk_clear(start, end, walk); + } + pte_unmap(ptep - 1); + + return 0; +} + +/* + * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses + * @vma: virtual memory area containing the virtual address range + * @range: used to track snapshot validity + * @start: range virtual start address (inclusive) + * @end: range virtual end address (exclusive) + * @entries: array of hmm_pfn_t: provided by the caller, filled in by function + * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, 0 success + * + * This snapshots the CPU page table for a range of virtual addresses. Snapshot + * validity is tracked by range struct. See hmm_vma_range_done() for further + * information. + * + * The range struct is initialized here. It tracks the CPU page table, but only + * if the function returns success (0), in which case the caller must then call + * hmm_vma_range_done() to stop CPU page table update tracking on this range. + * + * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS + * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED ! + */ +int hmm_vma_get_pfns(struct vm_area_struct *vma, + struct hmm_range *range, + unsigned long start, + unsigned long end, + hmm_pfn_t *pfns) +{ + struct hmm_vma_walk hmm_vma_walk; + struct mm_walk mm_walk; + struct hmm *hmm; + + /* FIXME support hugetlb fs */ + if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { + hmm_pfns_special(pfns, start, end); + return -EINVAL; + } + + /* Sanity check, this really should not happen ! */ + if (start < vma->vm_start || start >= vma->vm_end) + return -EINVAL; + if (end < vma->vm_start || end > vma->vm_end) + return -EINVAL; + + hmm = hmm_register(vma->vm_mm); + if (!hmm) + return -ENOMEM; + /* Caller must have registered a mirror, via hmm_mirror_register() ! */ + if (!hmm->mmu_notifier.ops) + return -EINVAL; + + /* Initialize range to track CPU page table update */ + range->start = start; + range->pfns = pfns; + range->end = end; + spin_lock(&hmm->lock); + range->valid = true; + list_add_rcu(&range->list, &hmm->ranges); + spin_unlock(&hmm->lock); + + hmm_vma_walk.fault = false; + hmm_vma_walk.range = range; + mm_walk.private = &hmm_vma_walk; + + mm_walk.vma = vma; + mm_walk.mm = vma->vm_mm; + mm_walk.pte_entry = NULL; + mm_walk.test_walk = NULL; + mm_walk.hugetlb_entry = NULL; + mm_walk.pmd_entry = hmm_vma_walk_pmd; + mm_walk.pte_hole = hmm_vma_walk_hole; + + walk_page_range(start, end, &mm_walk); + return 0; +} +EXPORT_SYMBOL(hmm_vma_get_pfns); + +/* + * hmm_vma_range_done() - stop tracking change to CPU page table over a range + * @vma: virtual memory area containing the virtual address range + * @range: range being tracked + * Returns: false if range data has been invalidated, true otherwise + * + * Range struct is used to track updates to the CPU page table after a call to + * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done + * using the data, or wants to lock updates to the data it got from those + * functions, it must call the hmm_vma_range_done() function, which will then + * stop tracking CPU page table updates. + * + * Note that device driver must still implement general CPU page table update + * tracking either by using hmm_mirror (see hmm_mirror_register()) or by using + * the mmu_notifier API directly. + * + * CPU page table update tracking done through hmm_range is only temporary and + * to be used while trying to duplicate CPU page table contents for a range of + * virtual addresses. + * + * There are two ways to use this : + * again: + * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...); + * trans = device_build_page_table_update_transaction(pfns); + * device_page_table_lock(); + * if (!hmm_vma_range_done(vma, range)) { + * device_page_table_unlock(); + * goto again; + * } + * device_commit_transaction(trans); + * device_page_table_unlock(); + * + * Or: + * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...); + * device_page_table_lock(); + * hmm_vma_range_done(vma, range); + * device_update_page_table(pfns); + * device_page_table_unlock(); + */ +bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range) +{ + unsigned long npages = (range->end - range->start) >> PAGE_SHIFT; + struct hmm *hmm; + + if (range->end <= range->start) { + BUG(); + return false; + } + + hmm = hmm_register(vma->vm_mm); + if (!hmm) { + memset(range->pfns, 0, sizeof(*range->pfns) * npages); + return false; + } + + spin_lock(&hmm->lock); + list_del_rcu(&range->list); + spin_unlock(&hmm->lock); + + return range->valid; +} +EXPORT_SYMBOL(hmm_vma_range_done); + +/* + * hmm_vma_fault() - try to fault some address in a virtual address range + * @vma: virtual memory area containing the virtual address range + * @range: use to track pfns array content validity + * @start: fault range virtual start address (inclusive) + * @end: fault range virtual end address (exclusive) + * @pfns: array of hmm_pfn_t, only entry with fault flag set will be faulted + * @write: is it a write fault + * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) + * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop) + * + * This is similar to a regular CPU page fault except that it will not trigger + * any memory migration if the memory being faulted is not accessible by CPUs. + * + * On error, for one virtual address in the range, the function will set the + * hmm_pfn_t error flag for the corresponding pfn entry. + * + * Expected use pattern: + * retry: + * down_read(&mm->mmap_sem); + * // Find vma and address device wants to fault, initialize hmm_pfn_t + * // array accordingly + * ret = hmm_vma_fault(vma, start, end, pfns, allow_retry); + * switch (ret) { + * case -EAGAIN: + * hmm_vma_range_done(vma, range); + * // You might want to rate limit or yield to play nicely, you may + * // also commit any valid pfn in the array assuming that you are + * // getting true from hmm_vma_range_monitor_end() + * goto retry; + * case 0: + * break; + * default: + * // Handle error ! + * up_read(&mm->mmap_sem) + * return; + * } + * // Take device driver lock that serialize device page table update + * driver_lock_device_page_table_update(); + * hmm_vma_range_done(vma, range); + * // Commit pfns we got from hmm_vma_fault() + * driver_unlock_device_page_table_update(); + * up_read(&mm->mmap_sem) + * + * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURN SUCCESS (0) + * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION ! + * + * YOU HAVE BEEN WARNED ! + */ +int hmm_vma_fault(struct vm_area_struct *vma, + struct hmm_range *range, + unsigned long start, + unsigned long end, + hmm_pfn_t *pfns, + bool write, + bool block) +{ + struct hmm_vma_walk hmm_vma_walk; + struct mm_walk mm_walk; + struct hmm *hmm; + int ret; + + /* Sanity check, this really should not happen ! */ + if (start < vma->vm_start || start >= vma->vm_end) + return -EINVAL; + if (end < vma->vm_start || end > vma->vm_end) + return -EINVAL; + + hmm = hmm_register(vma->vm_mm); + if (!hmm) { + hmm_pfns_clear(pfns, start, end); + return -ENOMEM; + } + /* Caller must have registered a mirror using hmm_mirror_register() */ + if (!hmm->mmu_notifier.ops) + return -EINVAL; + + /* Initialize range to track CPU page table update */ + range->start = start; + range->pfns = pfns; + range->end = end; + spin_lock(&hmm->lock); + range->valid = true; + list_add_rcu(&range->list, &hmm->ranges); + spin_unlock(&hmm->lock); + + /* FIXME support hugetlb fs */ + if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { + hmm_pfns_special(pfns, start, end); + return 0; + } + + hmm_vma_walk.fault = true; + hmm_vma_walk.write = write; + hmm_vma_walk.block = block; + hmm_vma_walk.range = range; + mm_walk.private = &hmm_vma_walk; + hmm_vma_walk.last = range->start; + + mm_walk.vma = vma; + mm_walk.mm = vma->vm_mm; + mm_walk.pte_entry = NULL; + mm_walk.test_walk = NULL; + mm_walk.hugetlb_entry = NULL; + mm_walk.pmd_entry = hmm_vma_walk_pmd; + mm_walk.pte_hole = hmm_vma_walk_hole; + + do { + ret = walk_page_range(start, end, &mm_walk); + start = hmm_vma_walk.last; + } while (ret == -EAGAIN); + + if (ret) { + unsigned long i; + + i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; + hmm_pfns_clear(&pfns[i], hmm_vma_walk.last, end); + hmm_vma_range_done(vma, range); + } + return ret; +} +EXPORT_SYMBOL(hmm_vma_fault); +#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ + + +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) +struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma, + unsigned long addr) +{ + struct page *page; + + page = alloc_page_vma(GFP_HIGHUSER, vma, addr); + if (!page) + return NULL; + lock_page(page); + return page; +} +EXPORT_SYMBOL(hmm_vma_alloc_locked_page); + + +static void hmm_devmem_ref_release(struct percpu_ref *ref) +{ + struct hmm_devmem *devmem; + + devmem = container_of(ref, struct hmm_devmem, ref); + complete(&devmem->completion); +} + +static void hmm_devmem_ref_exit(void *data) +{ + struct percpu_ref *ref = data; + struct hmm_devmem *devmem; + + devmem = container_of(ref, struct hmm_devmem, ref); + percpu_ref_exit(ref); + devm_remove_action(devmem->device, &hmm_devmem_ref_exit, data); +} + +static void hmm_devmem_ref_kill(void *data) +{ + struct percpu_ref *ref = data; + struct hmm_devmem *devmem; + + devmem = container_of(ref, struct hmm_devmem, ref); + percpu_ref_kill(ref); + wait_for_completion(&devmem->completion); + devm_remove_action(devmem->device, &hmm_devmem_ref_kill, data); +} + +static int hmm_devmem_fault(struct vm_area_struct *vma, + unsigned long addr, + const struct page *page, + unsigned int flags, + pmd_t *pmdp) +{ + struct hmm_devmem *devmem = page->pgmap->data; + + return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp); +} + +static void hmm_devmem_free(struct page *page, void *data) +{ + struct hmm_devmem *devmem = data; + + devmem->ops->free(devmem, page); +} + +static DEFINE_MUTEX(hmm_devmem_lock); +static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL); + +static void hmm_devmem_radix_release(struct resource *resource) +{ + resource_size_t key, align_start, align_size, align_end; + + align_start = resource->start & ~(PA_SECTION_SIZE - 1); + align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE); + align_end = align_start + align_size - 1; + + mutex_lock(&hmm_devmem_lock); + for (key = resource->start; + key <= resource->end; + key += PA_SECTION_SIZE) + radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT); + mutex_unlock(&hmm_devmem_lock); +} + +static void hmm_devmem_release(struct device *dev, void *data) +{ + struct hmm_devmem *devmem = data; + struct resource *resource = devmem->resource; + unsigned long start_pfn, npages; + struct zone *zone; + struct page *page; + + if (percpu_ref_tryget_live(&devmem->ref)) { + dev_WARN(dev, "%s: page mapping is still live!\n", __func__); + percpu_ref_put(&devmem->ref); + } + + /* pages are dead and unused, undo the arch mapping */ + start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT; + npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT; + + page = pfn_to_page(start_pfn); + zone = page_zone(page); + + mem_hotplug_begin(); + if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) + __remove_pages(zone, start_pfn, npages); + else + arch_remove_memory(start_pfn << PAGE_SHIFT, + npages << PAGE_SHIFT); + mem_hotplug_done(); + + hmm_devmem_radix_release(resource); +} + +static struct hmm_devmem *hmm_devmem_find(resource_size_t phys) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + + return radix_tree_lookup(&hmm_devmem_radix, phys >> PA_SECTION_SHIFT); +} + +static int hmm_devmem_pages_create(struct hmm_devmem *devmem) +{ + resource_size_t key, align_start, align_size, align_end; + struct device *device = devmem->device; + int ret, nid, is_ram; + unsigned long pfn; + + align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1); + align_size = ALIGN(devmem->resource->start + + resource_size(devmem->resource), + PA_SECTION_SIZE) - align_start; + + is_ram = region_intersects(align_start, align_size, + IORESOURCE_SYSTEM_RAM, + IORES_DESC_NONE); + if (is_ram == REGION_MIXED) { + WARN_ONCE(1, "%s attempted on mixed region %pr\n", + __func__, devmem->resource); + return -ENXIO; + } + if (is_ram == REGION_INTERSECTS) + return -ENXIO; + + if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY) + devmem->pagemap.type = MEMORY_DEVICE_PUBLIC; + else + devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; + + devmem->pagemap.res = devmem->resource; + devmem->pagemap.page_fault = hmm_devmem_fault; + devmem->pagemap.page_free = hmm_devmem_free; + devmem->pagemap.dev = devmem->device; + devmem->pagemap.ref = &devmem->ref; + devmem->pagemap.data = devmem; + + mutex_lock(&hmm_devmem_lock); + align_end = align_start + align_size - 1; + for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) { + struct hmm_devmem *dup; + + rcu_read_lock(); + dup = hmm_devmem_find(key); + rcu_read_unlock(); + if (dup) { + dev_err(device, "%s: collides with mapping for %s\n", + __func__, dev_name(dup->device)); + mutex_unlock(&hmm_devmem_lock); + ret = -EBUSY; + goto error; + } + ret = radix_tree_insert(&hmm_devmem_radix, + key >> PA_SECTION_SHIFT, + devmem); + if (ret) { + dev_err(device, "%s: failed: %d\n", __func__, ret); + mutex_unlock(&hmm_devmem_lock); + goto error_radix; + } + } + mutex_unlock(&hmm_devmem_lock); + + nid = dev_to_node(device); + if (nid < 0) + nid = numa_mem_id(); + + mem_hotplug_begin(); + /* + * For device private memory we call add_pages() as we only need to + * allocate and initialize struct page for the device memory. More- + * over the device memory is un-accessible thus we do not want to + * create a linear mapping for the memory like arch_add_memory() + * would do. + * + * For device public memory, which is accesible by the CPU, we do + * want the linear mapping and thus use arch_add_memory(). + */ + if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC) + ret = arch_add_memory(nid, align_start, align_size, false); + else + ret = add_pages(nid, align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT, false); + if (ret) { + mem_hotplug_done(); + goto error_add_memory; + } + move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], + align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT); + mem_hotplug_done(); + + for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) { + struct page *page = pfn_to_page(pfn); + + page->pgmap = &devmem->pagemap; + } + return 0; + +error_add_memory: + untrack_pfn(NULL, PHYS_PFN(align_start), align_size); +error_radix: + hmm_devmem_radix_release(devmem->resource); +error: + return ret; +} + +static int hmm_devmem_match(struct device *dev, void *data, void *match_data) +{ + struct hmm_devmem *devmem = data; + + return devmem->resource == match_data; +} + +static void hmm_devmem_pages_remove(struct hmm_devmem *devmem) +{ + devres_release(devmem->device, &hmm_devmem_release, + &hmm_devmem_match, devmem->resource); +} + +/* + * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory + * + * @ops: memory event device driver callback (see struct hmm_devmem_ops) + * @device: device struct to bind the resource too + * @size: size in bytes of the device memory to add + * Returns: pointer to new hmm_devmem struct ERR_PTR otherwise + * + * This function first finds an empty range of physical address big enough to + * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which + * in turn allocates struct pages. It does not do anything beyond that; all + * events affecting the memory will go through the various callbacks provided + * by hmm_devmem_ops struct. + * + * Device driver should call this function during device initialization and + * is then responsible of memory management. HMM only provides helpers. + */ +struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, + struct device *device, + unsigned long size) +{ + struct hmm_devmem *devmem; + resource_size_t addr; + int ret; + + static_branch_enable(&device_private_key); + + devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), + GFP_KERNEL, dev_to_node(device)); + if (!devmem) + return ERR_PTR(-ENOMEM); + + init_completion(&devmem->completion); + devmem->pfn_first = -1UL; + devmem->pfn_last = -1UL; + devmem->resource = NULL; + devmem->device = device; + devmem->ops = ops; + + ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, + 0, GFP_KERNEL); + if (ret) + goto error_percpu_ref; + + ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref); + if (ret) + goto error_devm_add_action; + + size = ALIGN(size, PA_SECTION_SIZE); + addr = min((unsigned long)iomem_resource.end, + (1UL << MAX_PHYSMEM_BITS) - 1); + addr = addr - size + 1UL; + + /* + * FIXME add a new helper to quickly walk resource tree and find free + * range + * + * FIXME what about ioport_resource resource ? + */ + for (; addr > size && addr >= iomem_resource.start; addr -= size) { + ret = region_intersects(addr, size, 0, IORES_DESC_NONE); + if (ret != REGION_DISJOINT) + continue; + + devmem->resource = devm_request_mem_region(device, addr, size, + dev_name(device)); + if (!devmem->resource) { + ret = -ENOMEM; + goto error_no_resource; + } + break; + } + if (!devmem->resource) { + ret = -ERANGE; + goto error_no_resource; + } + + devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; + devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; + devmem->pfn_last = devmem->pfn_first + + (resource_size(devmem->resource) >> PAGE_SHIFT); + + ret = hmm_devmem_pages_create(devmem); + if (ret) + goto error_pages; + + devres_add(device, devmem); + + ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref); + if (ret) { + hmm_devmem_remove(devmem); + return ERR_PTR(ret); + } + + return devmem; + +error_pages: + devm_release_mem_region(device, devmem->resource->start, + resource_size(devmem->resource)); +error_no_resource: +error_devm_add_action: + hmm_devmem_ref_kill(&devmem->ref); + hmm_devmem_ref_exit(&devmem->ref); +error_percpu_ref: + devres_free(devmem); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(hmm_devmem_add); + +struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, + struct device *device, + struct resource *res) +{ + struct hmm_devmem *devmem; + int ret; + + if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY) + return ERR_PTR(-EINVAL); + + static_branch_enable(&device_private_key); + + devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), + GFP_KERNEL, dev_to_node(device)); + if (!devmem) + return ERR_PTR(-ENOMEM); + + init_completion(&devmem->completion); + devmem->pfn_first = -1UL; + devmem->pfn_last = -1UL; + devmem->resource = res; + devmem->device = device; + devmem->ops = ops; + + ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, + 0, GFP_KERNEL); + if (ret) + goto error_percpu_ref; + + ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref); + if (ret) + goto error_devm_add_action; + + + devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; + devmem->pfn_last = devmem->pfn_first + + (resource_size(devmem->resource) >> PAGE_SHIFT); + + ret = hmm_devmem_pages_create(devmem); + if (ret) + goto error_devm_add_action; + + devres_add(device, devmem); + + ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref); + if (ret) { + hmm_devmem_remove(devmem); + return ERR_PTR(ret); + } + + return devmem; + +error_devm_add_action: + hmm_devmem_ref_kill(&devmem->ref); + hmm_devmem_ref_exit(&devmem->ref); +error_percpu_ref: + devres_free(devmem); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(hmm_devmem_add_resource); + +/* + * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE) + * + * @devmem: hmm_devmem struct use to track and manage the ZONE_DEVICE memory + * + * This will hot-unplug memory that was hotplugged by hmm_devmem_add on behalf + * of the device driver. It will free struct page and remove the resource that + * reserved the physical address range for this device memory. + */ +void hmm_devmem_remove(struct hmm_devmem *devmem) +{ + resource_size_t start, size; + struct device *device; + bool cdm = false; + + if (!devmem) + return; + + device = devmem->device; + start = devmem->resource->start; + size = resource_size(devmem->resource); + + cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY; + hmm_devmem_ref_kill(&devmem->ref); + hmm_devmem_ref_exit(&devmem->ref); + hmm_devmem_pages_remove(devmem); + + if (!cdm) + devm_release_mem_region(device, start, size); +} +EXPORT_SYMBOL(hmm_devmem_remove); + +/* + * A device driver that wants to handle multiple devices memory through a + * single fake device can use hmm_device to do so. This is purely a helper + * and it is not needed to make use of any HMM functionality. + */ +#define HMM_DEVICE_MAX 256 + +static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX); +static DEFINE_SPINLOCK(hmm_device_lock); +static struct class *hmm_device_class; +static dev_t hmm_device_devt; + +static void hmm_device_release(struct device *device) +{ + struct hmm_device *hmm_device; + + hmm_device = container_of(device, struct hmm_device, device); + spin_lock(&hmm_device_lock); + clear_bit(hmm_device->minor, hmm_device_mask); + spin_unlock(&hmm_device_lock); + + kfree(hmm_device); +} + +struct hmm_device *hmm_device_new(void *drvdata) +{ + struct hmm_device *hmm_device; + + hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL); + if (!hmm_device) + return ERR_PTR(-ENOMEM); + + spin_lock(&hmm_device_lock); + hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX); + if (hmm_device->minor >= HMM_DEVICE_MAX) { + spin_unlock(&hmm_device_lock); + kfree(hmm_device); + return ERR_PTR(-EBUSY); + } + set_bit(hmm_device->minor, hmm_device_mask); + spin_unlock(&hmm_device_lock); + + dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor); + hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt), + hmm_device->minor); + hmm_device->device.release = hmm_device_release; + dev_set_drvdata(&hmm_device->device, drvdata); + hmm_device->device.class = hmm_device_class; + device_initialize(&hmm_device->device); + + return hmm_device; +} +EXPORT_SYMBOL(hmm_device_new); + +void hmm_device_put(struct hmm_device *hmm_device) +{ + put_device(&hmm_device->device); +} +EXPORT_SYMBOL(hmm_device_put); + +static int __init hmm_init(void) +{ + int ret; + + ret = alloc_chrdev_region(&hmm_device_devt, 0, + HMM_DEVICE_MAX, + "hmm_device"); + if (ret) + return ret; + + hmm_device_class = class_create(THIS_MODULE, "hmm_device"); + if (IS_ERR(hmm_device_class)) { + unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX); + return PTR_ERR(hmm_device_class); + } + return 0; +} + +device_initcall(hmm_init); +#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 90731e3b7e58..269b5df58543 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -328,7 +328,7 @@ static struct attribute *hugepage_attr[] = { NULL, }; -static struct attribute_group hugepage_attr_group = { +static const struct attribute_group hugepage_attr_group = { .attrs = hugepage_attr, }; @@ -567,7 +567,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, goto release; } - clear_huge_page(page, haddr, HPAGE_PMD_NR); + clear_huge_page(page, vmf->address, HPAGE_PMD_NR); /* * The memory barrier inside __SetPageUptodate makes sure that * clear_huge_page writes become visible before the set_pmd_at() @@ -928,6 +928,25 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = -EAGAIN; pmd = *src_pmd; + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + if (unlikely(is_swap_pmd(pmd))) { + swp_entry_t entry = pmd_to_swp_entry(pmd); + + VM_BUG_ON(!is_pmd_migration_entry(pmd)); + if (is_write_migration_entry(entry)) { + make_migration_entry_read(&entry); + pmd = swp_entry_to_pmd(entry); + if (pmd_swp_soft_dirty(*src_pmd)) + pmd = pmd_swp_mksoft_dirty(pmd); + set_pmd_at(src_mm, addr, src_pmd, pmd); + } + set_pmd_at(dst_mm, addr, dst_pmd, pmd); + ret = 0; + goto out_unlock; + } +#endif + if (unlikely(!pmd_trans_huge(pmd))) { pte_free(dst_mm, pgtable); goto out_unlock; @@ -1240,15 +1259,29 @@ int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) * We can only reuse the page if nobody else maps the huge page or it's * part. */ - if (page_trans_huge_mapcount(page, NULL) == 1) { + if (!trylock_page(page)) { + get_page(page); + spin_unlock(vmf->ptl); + lock_page(page); + spin_lock(vmf->ptl); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { + unlock_page(page); + put_page(page); + goto out_unlock; + } + put_page(page); + } + if (reuse_swap_page(page, NULL)) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); ret |= VM_FAULT_WRITE; + unlock_page(page); goto out_unlock; } + unlock_page(page); get_page(page); spin_unlock(vmf->ptl); alloc: @@ -1291,7 +1324,7 @@ alloc: count_vm_event(THP_FAULT_ALLOC); if (!page) - clear_huge_page(new_page, haddr, HPAGE_PMD_NR); + clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR); else copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); @@ -1510,8 +1543,15 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) } /* - * The page_table_lock above provides a memory barrier - * with change_protection_range. + * Since we took the NUMA fault, we must have observed the !accessible + * bit. Make sure all other CPUs agree with that, to avoid them + * modifying the page we're about to migrate. + * + * Must be done under PTL such that we'll observe the relevant + * inc_tlb_flush_pending(). + * + * We are not sure a pending tlb flush here is for a huge page + * mapping or not. Hence use the tlb range variant */ if (mm_tlb_flush_pending(vma->vm_mm)) flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); @@ -1521,6 +1561,7 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) * and access rights restored. */ spin_unlock(vmf->ptl); + migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, vmf->pmd, pmd, vmf->address, page, target_nid); if (migrated) { @@ -1577,6 +1618,12 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (is_huge_zero_pmd(orig_pmd)) goto out; + if (unlikely(!pmd_present(orig_pmd))) { + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(orig_pmd)); + goto out; + } + page = pmd_page(orig_pmd); /* * If other processes are mapping this page, we couldn't discard @@ -1662,10 +1709,24 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, spin_unlock(ptl); tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); } else { - struct page *page = pmd_page(orig_pmd); - page_remove_rmap(page, true); - VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); - VM_BUG_ON_PAGE(!PageHead(page), page); + struct page *page = NULL; + int flush_needed = 1; + + if (pmd_present(orig_pmd)) { + page = pmd_page(orig_pmd); + page_remove_rmap(page, true); + VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); + VM_BUG_ON_PAGE(!PageHead(page), page); + } else if (thp_migration_supported()) { + swp_entry_t entry; + + VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); + entry = pmd_to_swp_entry(orig_pmd); + page = pfn_to_page(swp_offset(entry)); + flush_needed = 0; + } else + WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); + if (PageAnon(page)) { zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); @@ -1674,8 +1735,10 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR); } + spin_unlock(ptl); - tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE); + if (flush_needed) + tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE); } return 1; } @@ -1695,6 +1758,17 @@ static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, } #endif +static pmd_t move_soft_dirty_pmd(pmd_t pmd) +{ +#ifdef CONFIG_MEM_SOFT_DIRTY + if (unlikely(is_pmd_migration_entry(pmd))) + pmd = pmd_swp_mksoft_dirty(pmd); + else if (pmd_present(pmd)) + pmd = pmd_mksoft_dirty(pmd); +#endif + return pmd; +} + bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush) @@ -1737,7 +1811,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); pgtable_trans_huge_deposit(mm, new_pmd, pgtable); } - set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); + pmd = move_soft_dirty_pmd(pmd); + set_pmd_at(mm, new_addr, new_pmd, pmd); if (new_ptl != old_ptl) spin_unlock(new_ptl); if (force_flush) @@ -1772,6 +1847,27 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, preserve_write = prot_numa && pmd_write(*pmd); ret = 1; +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + if (is_swap_pmd(*pmd)) { + swp_entry_t entry = pmd_to_swp_entry(*pmd); + + VM_BUG_ON(!is_pmd_migration_entry(*pmd)); + if (is_write_migration_entry(entry)) { + pmd_t newpmd; + /* + * A protection check is difficult so + * just be safe and disable write + */ + make_migration_entry_read(&entry); + newpmd = swp_entry_to_pmd(entry); + if (pmd_swp_soft_dirty(*pmd)) + newpmd = pmd_swp_mksoft_dirty(newpmd); + set_pmd_at(mm, addr, pmd, newpmd); + } + goto unlock; + } +#endif + /* * Avoid trapping faults against the zero page. The read-only * data is likely to be read-cached on the local CPU and @@ -1837,7 +1933,8 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { spinlock_t *ptl; ptl = pmd_lock(vma->vm_mm, pmd); - if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd))) + if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || + pmd_devmap(*pmd))) return ptl; spin_unlock(ptl); return NULL; @@ -1955,14 +2052,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, struct page *page; pgtable_t pgtable; pmd_t _pmd; - bool young, write, dirty, soft_dirty; + bool young, write, dirty, soft_dirty, pmd_migration = false; unsigned long addr; int i; VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); VM_BUG_ON_VMA(vma->vm_start > haddr, vma); VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); - VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)); + VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) + && !pmd_devmap(*pmd)); count_vm_event(THP_SPLIT_PMD); @@ -1987,7 +2085,16 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, return __split_huge_zero_page_pmd(vma, haddr, pmd); } - page = pmd_page(*pmd); +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + pmd_migration = is_pmd_migration_entry(*pmd); + if (pmd_migration) { + swp_entry_t entry; + + entry = pmd_to_swp_entry(*pmd); + page = pfn_to_page(swp_offset(entry)); + } else +#endif + page = pmd_page(*pmd); VM_BUG_ON_PAGE(!page_count(page), page); page_ref_add(page, HPAGE_PMD_NR - 1); write = pmd_write(*pmd); @@ -2006,7 +2113,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, * transferred to avoid any possibility of altering * permissions across VMAs. */ - if (freeze) { + if (freeze || pmd_migration) { swp_entry_t swp_entry; swp_entry = make_migration_entry(page + i, write); entry = swp_entry_to_pte(swp_entry); @@ -2105,7 +2212,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, page = pmd_page(*pmd); if (PageMlocked(page)) clear_page_mlock(page); - } else if (!pmd_devmap(*pmd)) + } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) goto out; __split_huge_pmd_locked(vma, pmd, haddr, freeze); out: @@ -2188,7 +2295,7 @@ static void freeze_page(struct page *page) VM_BUG_ON_PAGE(!PageHead(page), page); if (PageAnon(page)) - ttu_flags |= TTU_MIGRATION; + ttu_flags |= TTU_SPLIT_FREEZE; unmap_success = try_to_unmap(page, ttu_flags); VM_BUG_ON_PAGE(!unmap_success, page); @@ -2459,6 +2566,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageCompound(page), page); + if (PageWriteback(page)) + return -EBUSY; + if (PageAnon(head)) { /* * The caller does not necessarily hold an mmap_sem that would @@ -2536,7 +2646,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) __dec_node_page_state(page, NR_SHMEM_THPS); spin_unlock(&pgdata->split_queue_lock); __split_huge_page(page, list, flags); - ret = 0; + if (PageSwapCache(head)) { + swp_entry_t entry = { .val = page_private(head) }; + + ret = split_swap_cluster(entry); + } else + ret = 0; } else { if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { pr_alert("total_mapcount: %u, page_count(): %u\n", @@ -2715,3 +2830,66 @@ static int __init split_huge_pages_debugfs(void) } late_initcall(split_huge_pages_debugfs); #endif + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, + struct page *page) +{ + struct vm_area_struct *vma = pvmw->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long address = pvmw->address; + pmd_t pmdval; + swp_entry_t entry; + pmd_t pmdswp; + + if (!(pvmw->pmd && !pvmw->pte)) + return; + + mmu_notifier_invalidate_range_start(mm, address, + address + HPAGE_PMD_SIZE); + + flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); + pmdval = *pvmw->pmd; + pmdp_invalidate(vma, address, pvmw->pmd); + if (pmd_dirty(pmdval)) + set_page_dirty(page); + entry = make_migration_entry(page, pmd_write(pmdval)); + pmdswp = swp_entry_to_pmd(entry); + if (pmd_soft_dirty(pmdval)) + pmdswp = pmd_swp_mksoft_dirty(pmdswp); + set_pmd_at(mm, address, pvmw->pmd, pmdswp); + page_remove_rmap(page, true); + put_page(page); + + mmu_notifier_invalidate_range_end(mm, address, + address + HPAGE_PMD_SIZE); +} + +void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) +{ + struct vm_area_struct *vma = pvmw->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long address = pvmw->address; + unsigned long mmun_start = address & HPAGE_PMD_MASK; + pmd_t pmde; + swp_entry_t entry; + + if (!(pvmw->pmd && !pvmw->pte)) + return; + + entry = pmd_to_swp_entry(*pvmw->pmd); + get_page(new); + pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot)); + if (pmd_swp_soft_dirty(*pvmw->pmd)) + pmde = pmd_mksoft_dirty(pmde); + if (is_write_migration_entry(entry)) + pmde = maybe_pmd_mkwrite(pmde, vma); + + flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE); + page_add_anon_rmap(new, vma, mmun_start, true); + set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); + if (vma->vm_flags & VM_LOCKED) + mlock_vma_page(new); + update_mmu_cache_pmd(vma, address, pvmw->pmd); +} +#endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 31e207cb399b..424b0ef08a60 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1066,11 +1066,11 @@ static void free_gigantic_page(struct page *page, unsigned int order) } static int __alloc_gigantic_page(unsigned long start_pfn, - unsigned long nr_pages) + unsigned long nr_pages, gfp_t gfp_mask) { unsigned long end_pfn = start_pfn + nr_pages; return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, - GFP_KERNEL); + gfp_mask); } static bool pfn_range_valid_gigantic(struct zone *z, @@ -1108,19 +1108,24 @@ static bool zone_spans_last_pfn(const struct zone *zone, return zone_spans_pfn(zone, last_pfn); } -static struct page *alloc_gigantic_page(int nid, unsigned int order) +static struct page *alloc_gigantic_page(int nid, struct hstate *h) { + unsigned int order = huge_page_order(h); unsigned long nr_pages = 1 << order; unsigned long ret, pfn, flags; - struct zone *z; + struct zonelist *zonelist; + struct zone *zone; + struct zoneref *z; + gfp_t gfp_mask; - z = NODE_DATA(nid)->node_zones; - for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) { - spin_lock_irqsave(&z->lock, flags); + gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; + zonelist = node_zonelist(nid, gfp_mask); + for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), NULL) { + spin_lock_irqsave(&zone->lock, flags); - pfn = ALIGN(z->zone_start_pfn, nr_pages); - while (zone_spans_last_pfn(z, pfn, nr_pages)) { - if (pfn_range_valid_gigantic(z, pfn, nr_pages)) { + pfn = ALIGN(zone->zone_start_pfn, nr_pages); + while (zone_spans_last_pfn(zone, pfn, nr_pages)) { + if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) { /* * We release the zone lock here because * alloc_contig_range() will also lock the zone @@ -1128,16 +1133,16 @@ static struct page *alloc_gigantic_page(int nid, unsigned int order) * spinning on this lock, it may win the race * and cause alloc_contig_range() to fail... */ - spin_unlock_irqrestore(&z->lock, flags); - ret = __alloc_gigantic_page(pfn, nr_pages); + spin_unlock_irqrestore(&zone->lock, flags); + ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask); if (!ret) return pfn_to_page(pfn); - spin_lock_irqsave(&z->lock, flags); + spin_lock_irqsave(&zone->lock, flags); } pfn += nr_pages; } - spin_unlock_irqrestore(&z->lock, flags); + spin_unlock_irqrestore(&zone->lock, flags); } return NULL; @@ -1150,7 +1155,7 @@ static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid) { struct page *page; - page = alloc_gigantic_page(nid, huge_page_order(h)); + page = alloc_gigantic_page(nid, h); if (page) { prep_compound_gigantic_page(page, huge_page_order(h)); prep_new_huge_page(h, page, nid); @@ -2083,7 +2088,9 @@ struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, return page; } -int __weak alloc_bootmem_huge_page(struct hstate *h) +int alloc_bootmem_huge_page(struct hstate *h) + __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); +int __alloc_bootmem_huge_page(struct hstate *h) { struct huge_bootmem_page *m; int nr_nodes, node; @@ -2569,13 +2576,13 @@ static struct attribute *hstate_attrs[] = { NULL, }; -static struct attribute_group hstate_attr_group = { +static const struct attribute_group hstate_attr_group = { .attrs = hstate_attrs, }; static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, struct kobject **hstate_kobjs, - struct attribute_group *hstate_attr_group) + const struct attribute_group *hstate_attr_group) { int retval; int hi = hstate_index(h); @@ -2633,7 +2640,7 @@ static struct attribute *per_node_hstate_attrs[] = { NULL, }; -static struct attribute_group per_node_hstate_attr_group = { +static const struct attribute_group per_node_hstate_attr_group = { .attrs = per_node_hstate_attrs, }; @@ -4600,6 +4607,15 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } +/* + * huge_pte_offset() - Walk the page table to resolve the hugepage + * entry at address @addr + * + * Return: Pointer to page table or swap entry (PUD or PMD) for + * address @addr, or NULL if a p*d_none() entry is encountered and the + * size @sz doesn't match the hugepage size at this level of the page + * table. + */ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { @@ -4614,13 +4630,22 @@ pte_t *huge_pte_offset(struct mm_struct *mm, p4d = p4d_offset(pgd, addr); if (!p4d_present(*p4d)) return NULL; + pud = pud_offset(p4d, addr); - if (!pud_present(*pud)) + if (sz != PUD_SIZE && pud_none(*pud)) return NULL; - if (pud_huge(*pud)) + /* hugepage or swap? */ + if (pud_huge(*pud) || !pud_present(*pud)) return (pte_t *)pud; + pmd = pmd_offset(pud, addr); - return (pte_t *) pmd; + if (sz != PMD_SIZE && pmd_none(*pmd)) + return NULL; + /* hugepage or swap? */ + if (pmd_huge(*pmd) || !pmd_present(*pmd)) + return (pte_t *)pmd; + + return NULL; } #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ diff --git a/mm/internal.h b/mm/internal.h index 4ef49fc55e58..1df011f62480 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -480,6 +480,17 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, /* Mask to get the watermark bits */ #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) +/* + * Only MMU archs have async oom victim reclaim - aka oom_reaper so we + * cannot assume a reduced access to memory reserves is sufficient for + * !MMU + */ +#ifdef CONFIG_MMU +#define ALLOC_OOM 0x08 +#else +#define ALLOC_OOM ALLOC_NO_WATERMARKS +#endif + #define ALLOC_HARDER 0x10 /* try to alloc harder */ #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ @@ -525,4 +536,5 @@ static inline bool is_migrate_highatomic_page(struct page *page) return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; } +void setup_zone_pageset(struct zone *zone); #endif /* __MM_INTERNAL_H */ diff --git a/mm/interval_tree.c b/mm/interval_tree.c index f2c2492681bf..b47664358796 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -28,7 +28,7 @@ INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb, /* Insert node immediately after prev in the interval tree */ void vma_interval_tree_insert_after(struct vm_area_struct *node, struct vm_area_struct *prev, - struct rb_root *root) + struct rb_root_cached *root) { struct rb_node **link; struct vm_area_struct *parent; @@ -55,7 +55,7 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node, node->shared.rb_subtree_last = last; rb_link_node(&node->shared.rb, &parent->shared.rb, link); - rb_insert_augmented(&node->shared.rb, root, + rb_insert_augmented(&node->shared.rb, &root->rb_root, &vma_interval_tree_augment); } @@ -74,7 +74,7 @@ INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last, static inline, __anon_vma_interval_tree) void anon_vma_interval_tree_insert(struct anon_vma_chain *node, - struct rb_root *root) + struct rb_root_cached *root) { #ifdef CONFIG_DEBUG_VM_RB node->cached_vma_start = avc_start_pgoff(node); @@ -84,13 +84,13 @@ void anon_vma_interval_tree_insert(struct anon_vma_chain *node, } void anon_vma_interval_tree_remove(struct anon_vma_chain *node, - struct rb_root *root) + struct rb_root_cached *root) { __anon_vma_interval_tree_remove(node, root); } struct anon_vma_chain * -anon_vma_interval_tree_iter_first(struct rb_root *root, +anon_vma_interval_tree_iter_first(struct rb_root_cached *root, unsigned long first, unsigned long last) { return __anon_vma_interval_tree_iter_first(root, first, last); diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index ca11bc4ce205..6f319fb81718 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -267,13 +267,13 @@ static void check_memory_region(unsigned long addr, check_memory_region_inline(addr, size, write, ret_ip); } -void kasan_check_read(const void *p, unsigned int size) +void kasan_check_read(const volatile void *p, unsigned int size) { check_memory_region((unsigned long)p, size, false, _RET_IP_); } EXPORT_SYMBOL(kasan_check_read); -void kasan_check_write(const void *p, unsigned int size) +void kasan_check_write(const volatile void *p, unsigned int size) { check_memory_region((unsigned long)p, size, true, _RET_IP_); } @@ -3043,7 +3043,7 @@ static struct attribute *ksm_attrs[] = { NULL, }; -static struct attribute_group ksm_attr_group = { +static const struct attribute_group ksm_attr_group = { .attrs = ksm_attrs, .name = "ksm", }; diff --git a/mm/madvise.c b/mm/madvise.c index 23ed525bc2bc..21261ff0466f 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -80,6 +80,17 @@ static long madvise_behavior(struct vm_area_struct *vma, } new_flags &= ~VM_DONTCOPY; break; + case MADV_WIPEONFORK: + /* MADV_WIPEONFORK is only supported on anonymous memory. */ + if (vma->vm_file || vma->vm_flags & VM_SHARED) { + error = -EINVAL; + goto out; + } + new_flags |= VM_WIPEONFORK; + break; + case MADV_KEEPONFORK: + new_flags &= ~VM_WIPEONFORK; + break; case MADV_DONTDUMP: new_flags |= VM_DONTDUMP; break; @@ -344,7 +355,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, continue; } - page = vm_normal_page(vma, addr, ptent); + page = _vm_normal_page(vma, addr, ptent, true); if (!page) continue; @@ -613,6 +624,7 @@ static int madvise_inject_error(int behavior, unsigned long start, unsigned long end) { struct page *page; + struct zone *zone; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -646,6 +658,11 @@ static int madvise_inject_error(int behavior, if (ret) return ret; } + + /* Ensure that all poisoned pages are removed from per-cpu lists */ + for_each_populated_zone(zone) + drain_all_pages(zone); + return 0; } #endif @@ -690,6 +707,8 @@ madvise_behavior_valid(int behavior) #endif case MADV_DONTDUMP: case MADV_DODUMP: + case MADV_WIPEONFORK: + case MADV_KEEPONFORK: #ifdef CONFIG_MEMORY_FAILURE case MADV_SOFT_OFFLINE: case MADV_HWPOISON: diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e09741af816f..15af3da5af02 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -119,6 +119,7 @@ static const char *const mem_cgroup_lru_names[] = { struct mem_cgroup_tree_per_node { struct rb_root rb_root; + struct rb_node *rb_rightmost; spinlock_t lock; }; @@ -386,6 +387,7 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, struct rb_node **p = &mctz->rb_root.rb_node; struct rb_node *parent = NULL; struct mem_cgroup_per_node *mz_node; + bool rightmost = true; if (mz->on_tree) return; @@ -397,8 +399,11 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, parent = *p; mz_node = rb_entry(parent, struct mem_cgroup_per_node, tree_node); - if (mz->usage_in_excess < mz_node->usage_in_excess) + if (mz->usage_in_excess < mz_node->usage_in_excess) { p = &(*p)->rb_left; + rightmost = false; + } + /* * We can't avoid mem cgroups that are over their soft * limit by the same amount @@ -406,6 +411,10 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, else if (mz->usage_in_excess >= mz_node->usage_in_excess) p = &(*p)->rb_right; } + + if (rightmost) + mctz->rb_rightmost = &mz->tree_node; + rb_link_node(&mz->tree_node, parent, p); rb_insert_color(&mz->tree_node, &mctz->rb_root); mz->on_tree = true; @@ -416,6 +425,10 @@ static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, { if (!mz->on_tree) return; + + if (&mz->tree_node == mctz->rb_rightmost) + mctz->rb_rightmost = rb_prev(&mz->tree_node); + rb_erase(&mz->tree_node, &mctz->rb_root); mz->on_tree = false; } @@ -496,16 +509,15 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) static struct mem_cgroup_per_node * __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) { - struct rb_node *rightmost = NULL; struct mem_cgroup_per_node *mz; retry: mz = NULL; - rightmost = rb_last(&mctz->rb_root); - if (!rightmost) + if (!mctz->rb_rightmost) goto done; /* Nothing to reclaim from */ - mz = rb_entry(rightmost, struct mem_cgroup_per_node, tree_node); + mz = rb_entry(mctz->rb_rightmost, + struct mem_cgroup_per_node, tree_node); /* * Remove the node now but someone else can add it back, * we will to add it back at the end of reclaim to its correct @@ -550,10 +562,12 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) * value, and reading all cpu value can be performance bottleneck in some * common workload, threshold and synchronization as vmstat[] should be * implemented. + * + * The parameter idx can be of type enum memcg_event_item or vm_event_item. */ static unsigned long memcg_sum_events(struct mem_cgroup *memcg, - enum memcg_event_item event) + int event) { unsigned long val = 0; int cpu; @@ -917,7 +931,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&iter->css, &it); + css_task_iter_start(&iter->css, 0, &it); while (!ret && (task = css_task_iter_next(&it))) ret = fn(task, arg); css_task_iter_end(&it); @@ -1790,6 +1804,9 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) } stock->nr_pages += nr_pages; + if (stock->nr_pages > CHARGE_BATCH) + drain_stock(stock); + local_irq_restore(flags); } @@ -1915,7 +1932,7 @@ retry: * bypass the last charges so that they can exit quickly and * free their memory. */ - if (unlikely(test_thread_flag(TIF_MEMDIE) || + if (unlikely(tsk_is_oom_victim(current) || fatal_signal_pending(current) || current->flags & PF_EXITING)) goto force; @@ -4319,6 +4336,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) } spin_unlock(&memcg->event_list_lock); + memcg->low = 0; + memcg_offline_kmem(memcg); wb_memcg_offline(memcg); @@ -4410,12 +4429,13 @@ enum mc_target_type { MC_TARGET_NONE = 0, MC_TARGET_PAGE, MC_TARGET_SWAP, + MC_TARGET_DEVICE, }; static struct page *mc_handle_present_pte(struct vm_area_struct *vma, unsigned long addr, pte_t ptent) { - struct page *page = vm_normal_page(vma, addr, ptent); + struct page *page = _vm_normal_page(vma, addr, ptent, true); if (!page || !page_mapped(page)) return NULL; @@ -4432,7 +4452,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, return page; } -#ifdef CONFIG_SWAP +#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE) static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, pte_t ptent, swp_entry_t *entry) { @@ -4441,6 +4461,23 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent)) return NULL; + + /* + * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to + * a device and because they are not accessible by CPU they are store + * as special swap entry in the CPU page table. + */ + if (is_device_private_entry(ent)) { + page = device_private_entry_to_page(ent); + /* + * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have + * a refcount of 1 when free (unlike normal page) + */ + if (!page_ref_add_unless(page, 1, 1)) + return NULL; + return page; + } + /* * Because lookup_swap_cache() updates some statistics counter, * we call find_get_page() with swapper_space directly. @@ -4601,6 +4638,13 @@ out: * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a * target for charge migration. if @target is not NULL, the entry is stored * in target->ent. + * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC + * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru). + * For now we such page is charge like a regular page would be as for all + * intent and purposes it is just special memory taking the place of a + * regular page. + * + * See Documentations/vm/hmm.txt and include/linux/hmm.h * * Called with pte lock held. */ @@ -4629,14 +4673,20 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, */ if (page->mem_cgroup == mc.from) { ret = MC_TARGET_PAGE; + if (is_device_private_page(page) || + is_device_public_page(page)) + ret = MC_TARGET_DEVICE; if (target) target->page = page; } if (!ret || !target) put_page(page); } - /* There is a swap entry and a page doesn't exist or isn't charged */ - if (ent.val && !ret && + /* + * There is a swap entry and a page doesn't exist or isn't charged. + * But we cannot move a tail-page in a THP. + */ + if (ent.val && !ret && (!page || !PageTransCompound(page)) && mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { ret = MC_TARGET_SWAP; if (target) @@ -4647,8 +4697,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* - * We don't consider swapping or file mapped pages because THP does not - * support them for now. + * We don't consider PMD mapped swapping or file mapped pages because THP does + * not support them for now. * Caller should make sure that pmd_trans_huge(pmd) is true. */ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, @@ -4657,6 +4707,11 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, struct page *page = NULL; enum mc_target_type ret = MC_TARGET_NONE; + if (unlikely(is_swap_pmd(pmd))) { + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(pmd)); + return ret; + } page = pmd_page(pmd); VM_BUG_ON_PAGE(!page || !PageHead(page), page); if (!(mc.flags & MOVE_ANON)) @@ -4688,6 +4743,11 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { + /* + * Note their can not be MC_TARGET_DEVICE for now as we do not + * support transparent huge page with MEMORY_DEVICE_PUBLIC or + * MEMORY_DEVICE_PRIVATE but this might change. + */ if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) mc.precharge += HPAGE_PMD_NR; spin_unlock(ptl); @@ -4903,6 +4963,14 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, putback_lru_page(page); } put_page(page); + } else if (target_type == MC_TARGET_DEVICE) { + page = target.page; + if (!mem_cgroup_move_account(page, true, + mc.from, mc.to)) { + mc.precharge -= HPAGE_PMD_NR; + mc.moved_charge += HPAGE_PMD_NR; + } + put_page(page); } spin_unlock(ptl); return 0; @@ -4914,12 +4982,16 @@ retry: pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; addr += PAGE_SIZE) { pte_t ptent = *(pte++); + bool device = false; swp_entry_t ent; if (!mc.precharge) break; switch (get_mctgt_type(vma, addr, ptent, &target)) { + case MC_TARGET_DEVICE: + device = true; + /* fall through */ case MC_TARGET_PAGE: page = target.page; /* @@ -4930,7 +5002,7 @@ retry: */ if (PageTransCompound(page)) goto put; - if (isolate_lru_page(page)) + if (!device && isolate_lru_page(page)) goto put; if (!mem_cgroup_move_account(page, false, mc.from, mc.to)) { @@ -4938,7 +5010,8 @@ retry: /* we uncharge from mc.from later. */ mc.moved_charge++; } - putback_lru_page(page); + if (!device) + putback_lru_page(page); put: /* get_mctgt_type() gets the page */ put_page(page); break; @@ -5423,7 +5496,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, * in turn serializes uncharging. */ VM_BUG_ON_PAGE(!PageLocked(page), page); - if (page->mem_cgroup) + if (compound_head(page)->mem_cgroup) goto out; if (do_swap_account) { @@ -5528,48 +5601,102 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, cancel_charge(memcg, nr_pages); } -static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, - unsigned long nr_anon, unsigned long nr_file, - unsigned long nr_kmem, unsigned long nr_huge, - unsigned long nr_shmem, struct page *dummy_page) +struct uncharge_gather { + struct mem_cgroup *memcg; + unsigned long pgpgout; + unsigned long nr_anon; + unsigned long nr_file; + unsigned long nr_kmem; + unsigned long nr_huge; + unsigned long nr_shmem; + struct page *dummy_page; +}; + +static inline void uncharge_gather_clear(struct uncharge_gather *ug) +{ + memset(ug, 0, sizeof(*ug)); +} + +static void uncharge_batch(const struct uncharge_gather *ug) { - unsigned long nr_pages = nr_anon + nr_file + nr_kmem; + unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem; unsigned long flags; - if (!mem_cgroup_is_root(memcg)) { - page_counter_uncharge(&memcg->memory, nr_pages); + if (!mem_cgroup_is_root(ug->memcg)) { + page_counter_uncharge(&ug->memcg->memory, nr_pages); if (do_memsw_account()) - page_counter_uncharge(&memcg->memsw, nr_pages); - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem) - page_counter_uncharge(&memcg->kmem, nr_kmem); - memcg_oom_recover(memcg); + page_counter_uncharge(&ug->memcg->memsw, nr_pages); + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem) + page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem); + memcg_oom_recover(ug->memcg); } local_irq_save(flags); - __this_cpu_sub(memcg->stat->count[MEMCG_RSS], nr_anon); - __this_cpu_sub(memcg->stat->count[MEMCG_CACHE], nr_file); - __this_cpu_sub(memcg->stat->count[MEMCG_RSS_HUGE], nr_huge); - __this_cpu_sub(memcg->stat->count[NR_SHMEM], nr_shmem); - __this_cpu_add(memcg->stat->events[PGPGOUT], pgpgout); - __this_cpu_add(memcg->stat->nr_page_events, nr_pages); - memcg_check_events(memcg, dummy_page); + __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon); + __this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file); + __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge); + __this_cpu_sub(ug->memcg->stat->count[NR_SHMEM], ug->nr_shmem); + __this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout); + __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages); + memcg_check_events(ug->memcg, ug->dummy_page); local_irq_restore(flags); - if (!mem_cgroup_is_root(memcg)) - css_put_many(&memcg->css, nr_pages); + if (!mem_cgroup_is_root(ug->memcg)) + css_put_many(&ug->memcg->css, nr_pages); +} + +static void uncharge_page(struct page *page, struct uncharge_gather *ug) +{ + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page); + + if (!page->mem_cgroup) + return; + + /* + * Nobody should be changing or seriously looking at + * page->mem_cgroup at this point, we have fully + * exclusive access to the page. + */ + + if (ug->memcg != page->mem_cgroup) { + if (ug->memcg) { + uncharge_batch(ug); + uncharge_gather_clear(ug); + } + ug->memcg = page->mem_cgroup; + } + + if (!PageKmemcg(page)) { + unsigned int nr_pages = 1; + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + ug->nr_huge += nr_pages; + } + if (PageAnon(page)) + ug->nr_anon += nr_pages; + else { + ug->nr_file += nr_pages; + if (PageSwapBacked(page)) + ug->nr_shmem += nr_pages; + } + ug->pgpgout++; + } else { + ug->nr_kmem += 1 << compound_order(page); + __ClearPageKmemcg(page); + } + + ug->dummy_page = page; + page->mem_cgroup = NULL; } static void uncharge_list(struct list_head *page_list) { - struct mem_cgroup *memcg = NULL; - unsigned long nr_shmem = 0; - unsigned long nr_anon = 0; - unsigned long nr_file = 0; - unsigned long nr_huge = 0; - unsigned long nr_kmem = 0; - unsigned long pgpgout = 0; + struct uncharge_gather ug; struct list_head *next; - struct page *page; + + uncharge_gather_clear(&ug); /* * Note that the list can be a single page->lru; hence the @@ -5577,57 +5704,16 @@ static void uncharge_list(struct list_head *page_list) */ next = page_list->next; do { + struct page *page; + page = list_entry(next, struct page, lru); next = page->lru.next; - VM_BUG_ON_PAGE(PageLRU(page), page); - VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page); - - if (!page->mem_cgroup) - continue; - - /* - * Nobody should be changing or seriously looking at - * page->mem_cgroup at this point, we have fully - * exclusive access to the page. - */ - - if (memcg != page->mem_cgroup) { - if (memcg) { - uncharge_batch(memcg, pgpgout, nr_anon, nr_file, - nr_kmem, nr_huge, nr_shmem, page); - pgpgout = nr_anon = nr_file = nr_kmem = 0; - nr_huge = nr_shmem = 0; - } - memcg = page->mem_cgroup; - } - - if (!PageKmemcg(page)) { - unsigned int nr_pages = 1; - - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - nr_huge += nr_pages; - } - if (PageAnon(page)) - nr_anon += nr_pages; - else { - nr_file += nr_pages; - if (PageSwapBacked(page)) - nr_shmem += nr_pages; - } - pgpgout++; - } else { - nr_kmem += 1 << compound_order(page); - __ClearPageKmemcg(page); - } - - page->mem_cgroup = NULL; + uncharge_page(page, &ug); } while (next != page_list); - if (memcg) - uncharge_batch(memcg, pgpgout, nr_anon, nr_file, - nr_kmem, nr_huge, nr_shmem, page); + if (ug.memcg) + uncharge_batch(&ug); } /** @@ -5639,6 +5725,8 @@ static void uncharge_list(struct list_head *page_list) */ void mem_cgroup_uncharge(struct page *page) { + struct uncharge_gather ug; + if (mem_cgroup_disabled()) return; @@ -5646,8 +5734,9 @@ void mem_cgroup_uncharge(struct page *page) if (!page->mem_cgroup) return; - INIT_LIST_HEAD(&page->lru); - uncharge_list(&page->lru); + uncharge_gather_clear(&ug); + uncharge_page(page, &ug); + uncharge_batch(&ug); } /** @@ -5812,8 +5901,7 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages); - page_counter_uncharge(&memcg->memory, nr_pages); - css_put_many(&memcg->css, nr_pages); + refill_stock(memcg, nr_pages); } static int __init cgroup_memory(char *s) @@ -5869,6 +5957,7 @@ static int __init mem_cgroup_init(void) node_online(node) ? node : NUMA_NO_NODE); rtpn->rb_root = RB_ROOT; + rtpn->rb_rightmost = NULL; spin_lock_init(&rtpn->lock); soft_limit_tree.rb_tree_per_node[node] = rtpn; } @@ -5906,6 +5995,7 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) void mem_cgroup_swapout(struct page *page, swp_entry_t entry) { struct mem_cgroup *memcg, *swap_memcg; + unsigned int nr_entries; unsigned short oldid; VM_BUG_ON_PAGE(PageLRU(page), page); @@ -5926,19 +6016,24 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * ancestor for the swap instead and transfer the memory+swap charge. */ swap_memcg = mem_cgroup_id_get_online(memcg); - oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 1); + nr_entries = hpage_nr_pages(page); + /* Get references for the tail pages, too */ + if (nr_entries > 1) + mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); + oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), + nr_entries); VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(swap_memcg, 1); + mem_cgroup_swap_statistics(swap_memcg, nr_entries); page->mem_cgroup = NULL; if (!mem_cgroup_is_root(memcg)) - page_counter_uncharge(&memcg->memory, 1); + page_counter_uncharge(&memcg->memory, nr_entries); if (memcg != swap_memcg) { if (!mem_cgroup_is_root(swap_memcg)) - page_counter_charge(&swap_memcg->memsw, 1); - page_counter_uncharge(&memcg->memsw, 1); + page_counter_charge(&swap_memcg->memsw, nr_entries); + page_counter_uncharge(&memcg->memsw, nr_entries); } /* @@ -5948,7 +6043,8 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * only synchronisation we have for udpating the per-CPU variables. */ VM_BUG_ON(!irqs_disabled()); - mem_cgroup_charge_statistics(memcg, page, false, -1); + mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page), + -nr_entries); memcg_check_events(memcg, page); if (!mem_cgroup_is_root(memcg)) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1cd3b3569af8..88366626c0b7 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1146,6 +1146,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags) return 0; } + arch_unmap_kpfn(pfn); + orig_head = hpage = compound_head(p); num_poisoned_pages_inc(); diff --git a/mm/memory.c b/mm/memory.c index fe2fba27ded2..ec4e15494901 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -49,6 +49,7 @@ #include <linux/swap.h> #include <linux/highmem.h> #include <linux/pagemap.h> +#include <linux/memremap.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/export.h> @@ -817,8 +818,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, #else # define HAVE_PTE_SPECIAL 0 #endif -struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, - pte_t pte) +struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte, bool with_public_device) { unsigned long pfn = pte_pfn(pte); @@ -829,8 +830,31 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, return vma->vm_ops->find_special_page(vma, addr); if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; - if (!is_zero_pfn(pfn)) - print_bad_pte(vma, addr, pte, NULL); + if (is_zero_pfn(pfn)) + return NULL; + + /* + * Device public pages are special pages (they are ZONE_DEVICE + * pages but different from persistent memory). They behave + * allmost like normal pages. The difference is that they are + * not on the lru and thus should never be involve with any- + * thing that involve lru manipulation (mlock, numa balancing, + * ...). + * + * This is why we still want to return NULL for such page from + * vm_normal_page() so that we do not have to special case all + * call site of vm_normal_page(). + */ + if (likely(pfn < highest_memmap_pfn)) { + struct page *page = pfn_to_page(pfn); + + if (is_device_public_page(page)) { + if (with_public_device) + return page; + return NULL; + } + } + print_bad_pte(vma, addr, pte, NULL); return NULL; } @@ -956,6 +980,35 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte = pte_swp_mksoft_dirty(pte); set_pte_at(src_mm, addr, src_pte, pte); } + } else if (is_device_private_entry(entry)) { + page = device_private_entry_to_page(entry); + + /* + * Update rss count even for unaddressable pages, as + * they should treated just like normal pages in this + * respect. + * + * We will likely want to have some new rss counters + * for unaddressable pages, at some point. But for now + * keep things as they are. + */ + get_page(page); + rss[mm_counter(page)]++; + page_dup_rmap(page, false); + + /* + * We do not preserve soft-dirty information, because so + * far, checkpoint/restore is the only feature that + * requires that. And checkpoint/restore does not work + * when a device driver is involved (you cannot easily + * save and restore device driver state). + */ + if (is_write_device_private_entry(entry) && + is_cow_mapping(vm_flags)) { + make_device_private_entry_read(&entry); + pte = swp_entry_to_pte(entry); + set_pte_at(src_mm, addr, src_pte, pte); + } } goto out_set_pte; } @@ -982,6 +1035,19 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, get_page(page); page_dup_rmap(page, false); rss[mm_counter(page)]++; + } else if (pte_devmap(pte)) { + page = pte_page(pte); + + /* + * Cache coherent device memory behave like regular page and + * not like persistent memory page. For more informations see + * MEMORY_DEVICE_CACHE_COHERENT in memory_hotplug.h + */ + if (is_device_public_page(page)) { + get_page(page); + page_dup_rmap(page, false); + rss[mm_counter(page)]++; + } } out_set_pte: @@ -1065,7 +1131,8 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { + if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd) + || pmd_devmap(*src_pmd)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma); err = copy_huge_pmd(dst_mm, src_mm, @@ -1236,7 +1303,7 @@ again: if (pte_present(ptent)) { struct page *page; - page = vm_normal_page(vma, addr, ptent); + page = _vm_normal_page(vma, addr, ptent, true); if (unlikely(details) && page) { /* * unmap_shared_mapping_pages() wants to @@ -1273,6 +1340,29 @@ again: } continue; } + + entry = pte_to_swp_entry(ptent); + if (non_swap_entry(entry) && is_device_private_entry(entry)) { + struct page *page = device_private_entry_to_page(entry); + + if (unlikely(details && details->check_mapping)) { + /* + * unmap_shared_mapping_pages() wants to + * invalidate cache without truncating: + * unmap shared but keep private pages. + */ + if (details->check_mapping != + page_rmapping(page)) + continue; + } + + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + rss[mm_counter(page)]--; + page_remove_rmap(page, false); + put_page(page); + continue; + } + /* If details->check_mapping, we leave swap entries. */ if (unlikely(details)) continue; @@ -1326,7 +1416,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { + if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { VM_BUG_ON_VMA(vma_is_anonymous(vma) && !rwsem_is_locked(&tlb->mm->mmap_sem), vma); @@ -1513,8 +1603,20 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, tlb_gather_mmu(&tlb, mm, start, end); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(mm, start, end); - for ( ; vma && vma->vm_start < end; vma = vma->vm_next) + for ( ; vma && vma->vm_start < end; vma = vma->vm_next) { unmap_single_vma(&tlb, vma, start, end, NULL); + + /* + * zap_page_range does not specify whether mmap_sem should be + * held for read or write. That allows parallel zap_page_range + * operations to unmap a PTE and defer a flush meaning that + * this call observes pte_none and fails to flush the TLB. + * Rather than adding a complex API, ensure that no stale + * TLB entries exist when this call returns. + */ + flush_tlb_range(vma, start, end); + } + mmu_notifier_invalidate_range_end(mm, start, end); tlb_finish_mmu(&tlb, start, end); } @@ -1676,7 +1778,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, EXPORT_SYMBOL(vm_insert_page); static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn, pgprot_t prot) + pfn_t pfn, pgprot_t prot, bool mkwrite) { struct mm_struct *mm = vma->vm_mm; int retval; @@ -1688,14 +1790,35 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, if (!pte) goto out; retval = -EBUSY; - if (!pte_none(*pte)) - goto out_unlock; + if (!pte_none(*pte)) { + if (mkwrite) { + /* + * For read faults on private mappings the PFN passed + * in may not match the PFN we have mapped if the + * mapped PFN is a writeable COW page. In the mkwrite + * case we are creating a writable PTE for a shared + * mapping and we expect the PFNs to match. + */ + if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn))) + goto out_unlock; + entry = *pte; + goto out_mkwrite; + } else + goto out_unlock; + } /* Ok, finally just insert the thing.. */ if (pfn_t_devmap(pfn)) entry = pte_mkdevmap(pfn_t_pte(pfn, prot)); else entry = pte_mkspecial(pfn_t_pte(pfn, prot)); + +out_mkwrite: + if (mkwrite) { + entry = pte_mkyoung(entry); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + } + set_pte_at(mm, addr, pte, entry); update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ @@ -1766,14 +1889,15 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); - ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot); + ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, + false); return ret; } EXPORT_SYMBOL(vm_insert_pfn_prot); -int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn) +static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, + pfn_t pfn, bool mkwrite) { pgprot_t pgprot = vma->vm_page_prot; @@ -1802,10 +1926,24 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, page = pfn_to_page(pfn_t_to_pfn(pfn)); return insert_page(vma, addr, page, pgprot); } - return insert_pfn(vma, addr, pfn, pgprot); + return insert_pfn(vma, addr, pfn, pgprot, mkwrite); +} + +int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, + pfn_t pfn) +{ + return __vm_insert_mixed(vma, addr, pfn, false); + } EXPORT_SYMBOL(vm_insert_mixed); +int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, + pfn_t pfn) +{ + return __vm_insert_mixed(vma, addr, pfn, true); +} +EXPORT_SYMBOL(vm_insert_mixed_mkwrite); + /* * maps a range of physical memory into the requested pages. the old * mappings are removed. any references to nonexistent pages results @@ -2571,7 +2709,7 @@ static int do_wp_page(struct vm_fault *vmf) * not dirty accountable. */ if (PageAnon(vmf->page) && !PageKsm(vmf->page)) { - int total_mapcount; + int total_map_swapcount; if (!trylock_page(vmf->page)) { get_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -2586,8 +2724,8 @@ static int do_wp_page(struct vm_fault *vmf) } put_page(vmf->page); } - if (reuse_swap_page(vmf->page, &total_mapcount)) { - if (total_mapcount == 1) { + if (reuse_swap_page(vmf->page, &total_map_swapcount)) { + if (total_map_swapcount == 1) { /* * The page is all ours. Move it to * our anon_vma so the rmap code will @@ -2623,7 +2761,7 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma, zap_page_range_single(vma, start_addr, end_addr - start_addr, details); } -static inline void unmap_mapping_range_tree(struct rb_root *root, +static inline void unmap_mapping_range_tree(struct rb_root_cached *root, struct zap_details *details) { struct vm_area_struct *vma; @@ -2687,7 +2825,7 @@ void unmap_mapping_range(struct address_space *mapping, details.last_index = ULONG_MAX; i_mmap_lock_write(mapping); - if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) + if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) unmap_mapping_range_tree(&mapping->i_mmap, &details); i_mmap_unlock_write(mapping); } @@ -2704,22 +2842,37 @@ EXPORT_SYMBOL(unmap_mapping_range); int do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *page, *swapcache; + struct page *page = NULL, *swapcache; struct mem_cgroup *memcg; + struct vma_swap_readahead swap_ra; swp_entry_t entry; pte_t pte; int locked; int exclusive = 0; int ret = 0; + bool vma_readahead = swap_use_vma_readahead(); - if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) + if (vma_readahead) + page = swap_readahead_detect(vmf, &swap_ra); + if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) { + if (page) + put_page(page); goto out; + } entry = pte_to_swp_entry(vmf->orig_pte); if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { migration_entry_wait(vma->vm_mm, vmf->pmd, vmf->address); + } else if (is_device_private_entry(entry)) { + /* + * For un-addressable device memory we call the pgmap + * fault handler callback. The callback must migrate + * the page back to some CPU accessible page. + */ + ret = device_private_entry_fault(vma, vmf->address, entry, + vmf->flags, vmf->pmd); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else { @@ -2729,10 +2882,16 @@ int do_swap_page(struct vm_fault *vmf) goto out; } delayacct_set_flag(DELAYACCT_PF_SWAPIN); - page = lookup_swap_cache(entry); + if (!page) + page = lookup_swap_cache(entry, vma_readahead ? vma : NULL, + vmf->address); if (!page) { - page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, - vmf->address); + if (vma_readahead) + page = do_swap_page_readahead(entry, + GFP_HIGHUSER_MOVABLE, vmf, &swap_ra); + else + page = swapin_readahead(entry, + GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (!page) { /* * Back out if somebody else faulted in this pte @@ -3802,6 +3961,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, .pgoff = linear_page_index(vma, address), .gfp_mask = __get_fault_gfp_mask(vma), }; + unsigned int dirty = flags & FAULT_FLAG_WRITE; struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; p4d_t *p4d; @@ -3824,7 +3984,6 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, barrier(); if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) { - unsigned int dirty = flags & FAULT_FLAG_WRITE; /* NUMA case for anonymous PUDs would go here */ @@ -3850,12 +4009,18 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, pmd_t orig_pmd = *vmf.pmd; barrier(); + if (unlikely(is_swap_pmd(orig_pmd))) { + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(orig_pmd)); + if (is_pmd_migration_entry(orig_pmd)) + pmd_migration_entry_wait(mm, vmf.pmd); + return 0; + } if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) return do_huge_pmd_numa_page(&vmf, orig_pmd); - if ((vmf.flags & FAULT_FLAG_WRITE) && - !pmd_write(orig_pmd)) { + if (dirty && !pmd_write(orig_pmd)) { ret = wp_huge_pmd(&vmf, orig_pmd); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -3888,6 +4053,11 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, /* do counter updates before entering really critical section. */ check_sync_rss_stat(current); + if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, + flags & FAULT_FLAG_INSTRUCTION, + flags & FAULT_FLAG_REMOTE)) + return VM_FAULT_SIGSEGV; + /* * Enable the memcg OOM handling for faults triggered in user * space. Kernel faults are handled more gracefully. @@ -3895,11 +4065,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, if (flags & FAULT_FLAG_USER) mem_cgroup_oom_enable(); - if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, - flags & FAULT_FLAG_INSTRUCTION, - flags & FAULT_FLAG_REMOTE)) - return VM_FAULT_SIGSEGV; - if (unlikely(is_vm_hugetlb_page(vma))) ret = hugetlb_fault(vma->vm_mm, vma, address, flags); else @@ -4008,7 +4173,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) #endif /* __PAGETABLE_PMD_FOLDED */ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, - pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) + unsigned long *start, unsigned long *end, + pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) { pgd_t *pgd; p4d_t *p4d; @@ -4035,17 +4201,29 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, if (!pmdpp) goto out; + if (start && end) { + *start = address & PMD_MASK; + *end = *start + PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, *start, *end); + } *ptlp = pmd_lock(mm, pmd); if (pmd_huge(*pmd)) { *pmdpp = pmd; return 0; } spin_unlock(*ptlp); + if (start && end) + mmu_notifier_invalidate_range_end(mm, *start, *end); } if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) goto out; + if (start && end) { + *start = address & PAGE_MASK; + *end = *start + PAGE_SIZE; + mmu_notifier_invalidate_range_start(mm, *start, *end); + } ptep = pte_offset_map_lock(mm, pmd, address, ptlp); if (!pte_present(*ptep)) goto unlock; @@ -4053,6 +4231,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, return 0; unlock: pte_unmap_unlock(ptep, *ptlp); + if (start && end) + mmu_notifier_invalidate_range_end(mm, *start, *end); out: return -EINVAL; } @@ -4064,20 +4244,21 @@ static inline int follow_pte(struct mm_struct *mm, unsigned long address, /* (void) is needed to make gcc happy */ (void) __cond_lock(*ptlp, - !(res = __follow_pte_pmd(mm, address, ptepp, NULL, - ptlp))); + !(res = __follow_pte_pmd(mm, address, NULL, NULL, + ptepp, NULL, ptlp))); return res; } int follow_pte_pmd(struct mm_struct *mm, unsigned long address, + unsigned long *start, unsigned long *end, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) { int res; /* (void) is needed to make gcc happy */ (void) __cond_lock(*ptlp, - !(res = __follow_pte_pmd(mm, address, ptepp, pmdpp, - ptlp))); + !(res = __follow_pte_pmd(mm, address, start, end, + ptepp, pmdpp, ptlp))); return res; } EXPORT_SYMBOL(follow_pte_pmd); @@ -4340,19 +4521,53 @@ static void clear_gigantic_page(struct page *page, } } void clear_huge_page(struct page *page, - unsigned long addr, unsigned int pages_per_huge_page) + unsigned long addr_hint, unsigned int pages_per_huge_page) { - int i; + int i, n, base, l; + unsigned long addr = addr_hint & + ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1); if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { clear_gigantic_page(page, addr, pages_per_huge_page); return; } + /* Clear sub-page to access last to keep its cache lines hot */ might_sleep(); - for (i = 0; i < pages_per_huge_page; i++) { + n = (addr_hint - addr) / PAGE_SIZE; + if (2 * n <= pages_per_huge_page) { + /* If sub-page to access in first half of huge page */ + base = 0; + l = n; + /* Clear sub-pages at the end of huge page */ + for (i = pages_per_huge_page - 1; i >= 2 * n; i--) { + cond_resched(); + clear_user_highpage(page + i, addr + i * PAGE_SIZE); + } + } else { + /* If sub-page to access in second half of huge page */ + base = pages_per_huge_page - 2 * (pages_per_huge_page - n); + l = pages_per_huge_page - n; + /* Clear sub-pages at the begin of huge page */ + for (i = 0; i < base; i++) { + cond_resched(); + clear_user_highpage(page + i, addr + i * PAGE_SIZE); + } + } + /* + * Clear remaining sub-pages in left-right-left-right pattern + * towards the sub-page to access + */ + for (i = 0; i < l; i++) { + int left_idx = base + i; + int right_idx = base + 2 * l - 1 - i; + + cond_resched(); + clear_user_highpage(page + left_idx, + addr + left_idx * PAGE_SIZE); cond_resched(); - clear_user_highpage(page + i, addr + i * PAGE_SIZE); + clear_user_highpage(page + right_idx, + addr + right_idx * PAGE_SIZE); } } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 8dccc317aac2..e882cb6da994 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -99,7 +99,7 @@ void mem_hotplug_done(void) /* add this memory to iomem resource */ static struct resource *register_memory_resource(u64 start, u64 size) { - struct resource *res; + struct resource *res, *conflict; res = kzalloc(sizeof(struct resource), GFP_KERNEL); if (!res) return ERR_PTR(-ENOMEM); @@ -108,7 +108,13 @@ static struct resource *register_memory_resource(u64 start, u64 size) res->start = start; res->end = start + size - 1; res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - if (request_resource(&iomem_resource, res) < 0) { + conflict = request_resource_conflict(&iomem_resource, res); + if (conflict) { + if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) { + pr_debug("Device unaddressable memory block " + "memory hotplug at %#010llx !\n", + (unsigned long long)start); + } pr_debug("System RAM resource %pR cannot be added\n", res); kfree(res); return ERR_PTR(-EEXIST); @@ -773,31 +779,6 @@ static void node_states_set_node(int node, struct memory_notify *arg) node_set_state(node, N_MEMORY); } -bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type) -{ - struct pglist_data *pgdat = NODE_DATA(nid); - struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; - struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages); - - /* - * TODO there shouldn't be any inherent reason to have ZONE_NORMAL - * physically before ZONE_MOVABLE. All we need is they do not - * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE - * though so let's stick with it for simplicity for now. - * TODO make sure we do not overlap with ZONE_DEVICE - */ - if (online_type == MMOP_ONLINE_KERNEL) { - if (zone_is_empty(movable_zone)) - return true; - return movable_zone->zone_start_pfn >= pfn + nr_pages; - } else if (online_type == MMOP_ONLINE_MOVABLE) { - return zone_end_pfn(default_zone) <= pfn; - } - - /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */ - return online_type == MMOP_ONLINE_KEEP; -} - static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { @@ -856,7 +837,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, * If no kernel zone covers this pfn range it will automatically go * to the ZONE_NORMAL. */ -struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, +static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn, unsigned long nr_pages) { struct pglist_data *pgdat = NODE_DATA(nid); @@ -872,17 +853,40 @@ struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, return &pgdat->node_zones[ZONE_NORMAL]; } -static inline bool movable_pfn_range(int nid, struct zone *default_zone, - unsigned long start_pfn, unsigned long nr_pages) +static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, + unsigned long nr_pages) +{ + struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn, + nr_pages); + struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; + bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages); + bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages); + + /* + * We inherit the existing zone in a simple case where zones do not + * overlap in the given range + */ + if (in_kernel ^ in_movable) + return (in_kernel) ? kernel_zone : movable_zone; + + /* + * If the range doesn't belong to any zone or two zones overlap in the + * given range then we use movable zone only if movable_node is + * enabled because we always online to a kernel zone by default. + */ + return movable_node_enabled ? movable_zone : kernel_zone; +} + +struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, + unsigned long nr_pages) { - if (!allow_online_pfn_range(nid, start_pfn, nr_pages, - MMOP_ONLINE_KERNEL)) - return true; + if (online_type == MMOP_ONLINE_KERNEL) + return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages); - if (!movable_node_is_enabled()) - return false; + if (online_type == MMOP_ONLINE_MOVABLE) + return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; - return !zone_intersects(default_zone, start_pfn, nr_pages); + return default_zone_for_pfn(nid, start_pfn, nr_pages); } /* @@ -892,28 +896,14 @@ static inline bool movable_pfn_range(int nid, struct zone *default_zone, static struct zone * __meminit move_pfn_range(int online_type, int nid, unsigned long start_pfn, unsigned long nr_pages) { - struct pglist_data *pgdat = NODE_DATA(nid); - struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages); - - if (online_type == MMOP_ONLINE_KEEP) { - struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; - /* - * MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but use - * movable zone if that is not possible (e.g. we are within - * or past the existing movable zone). movable_node overrides - * this default and defaults to movable zone - */ - if (movable_pfn_range(nid, zone, start_pfn, nr_pages)) - zone = movable_zone; - } else if (online_type == MMOP_ONLINE_MOVABLE) { - zone = &pgdat->node_zones[ZONE_MOVABLE]; - } + struct zone *zone; + zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages); move_pfn_range_to_zone(zone, start_pfn, nr_pages); return zone; } -/* Must be protected by mem_hotplug_begin() */ +/* Must be protected by mem_hotplug_begin() or a device_lock */ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) { unsigned long flags; @@ -925,9 +915,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ struct memory_notify arg; nid = pfn_to_nid(pfn); - if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type)) - return -EINVAL; - /* associate pfn range with the zone */ zone = move_pfn_range(online_type, nid, pfn, nr_pages); @@ -945,10 +932,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ * This means the page allocator ignores this zone. * So, zonelist must be updated after online. */ - mutex_lock(&zonelists_mutex); if (!populated_zone(zone)) { need_zonelists_rebuild = 1; - build_all_zonelists(NULL, zone); + setup_zone_pageset(zone); } ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, @@ -956,7 +942,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (ret) { if (need_zonelists_rebuild) zone_pcp_reset(zone); - mutex_unlock(&zonelists_mutex); goto failed_addition; } @@ -969,13 +954,11 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (onlined_pages) { node_states_set_node(nid, &arg); if (need_zonelists_rebuild) - build_all_zonelists(NULL, NULL); + build_all_zonelists(NULL); else zone_pcp_update(zone); } - mutex_unlock(&zonelists_mutex); - init_per_zone_wmark_min(); if (onlined_pages) { @@ -1046,9 +1029,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) * The node we allocated has no zone fallback lists. For avoiding * to access not-initialized zonelist, build here. */ - mutex_lock(&zonelists_mutex); - build_all_zonelists(pgdat, NULL); - mutex_unlock(&zonelists_mutex); + build_all_zonelists(pgdat); /* * zone->managed_pages is set to an approximate value in @@ -1100,13 +1081,6 @@ int try_online_node(int nid) node_set_online(nid); ret = register_one_node(nid); BUG_ON(ret); - - if (pgdat->node_zonelists->_zonerefs->zone == NULL) { - mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL, NULL); - mutex_unlock(&zonelists_mutex); - } - out: mem_hotplug_done(); return ret; @@ -1412,7 +1386,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (isolate_huge_page(page, &source)) move_pages -= 1 << compound_order(head); continue; - } + } else if (thp_migration_supported() && PageTransHuge(page)) + pfn = page_to_pfn(compound_head(page)) + + hpage_nr_pages(page) - 1; if (!get_page_unless_zero(page)) continue; @@ -1722,9 +1698,7 @@ repeat: if (!populated_zone(zone)) { zone_pcp_reset(zone); - mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL, NULL); - mutex_unlock(&zonelists_mutex); + build_all_zonelists(NULL); } else zone_pcp_update(zone); @@ -1750,7 +1724,7 @@ failed_removal: return ret; } -/* Must be protected by mem_hotplug_begin() */ +/* Must be protected by mem_hotplug_begin() or a device_lock */ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) { return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 618ab125228b..006ba625c0b8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -97,6 +97,7 @@ #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> #include <linux/printk.h> +#include <linux/swapops.h> #include <asm/tlbflush.h> #include <linux/uaccess.h> @@ -412,6 +413,64 @@ struct queue_pages { }; /* + * Check if the page's nid is in qp->nmask. + * + * If MPOL_MF_INVERT is set in qp->flags, check if the nid is + * in the invert of qp->nmask. + */ +static inline bool queue_pages_required(struct page *page, + struct queue_pages *qp) +{ + int nid = page_to_nid(page); + unsigned long flags = qp->flags; + + return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); +} + +static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + int ret = 0; + struct page *page; + struct queue_pages *qp = walk->private; + unsigned long flags; + + if (unlikely(is_pmd_migration_entry(*pmd))) { + ret = 1; + goto unlock; + } + page = pmd_page(*pmd); + if (is_huge_zero_page(page)) { + spin_unlock(ptl); + __split_huge_pmd(walk->vma, pmd, addr, false, NULL); + goto out; + } + if (!thp_migration_supported()) { + get_page(page); + spin_unlock(ptl); + lock_page(page); + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + goto out; + } + if (!queue_pages_required(page, qp)) { + ret = 1; + goto unlock; + } + + ret = 1; + flags = qp->flags; + /* go to thp migration */ + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + migrate_page_add(page, qp->pagelist, flags); +unlock: + spin_unlock(ptl); +out: + return ret; +} + +/* * Scan through pages checking if pages follow certain conditions, * and move them to the pagelist if they do. */ @@ -422,30 +481,15 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, struct page *page; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; - int nid, ret; + int ret; pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge(*pmd)) { - ptl = pmd_lock(walk->mm, pmd); - if (pmd_trans_huge(*pmd)) { - page = pmd_page(*pmd); - if (is_huge_zero_page(page)) { - spin_unlock(ptl); - __split_huge_pmd(vma, pmd, addr, false, NULL); - } else { - get_page(page); - spin_unlock(ptl); - lock_page(page); - ret = split_huge_page(page); - unlock_page(page); - put_page(page); - if (ret) - return 0; - } - } else { - spin_unlock(ptl); - } + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { + ret = queue_pages_pmd(pmd, ptl, addr, end, walk); + if (ret) + return 0; } if (pmd_trans_unstable(pmd)) @@ -464,10 +508,9 @@ retry: */ if (PageReserved(page)) continue; - nid = page_to_nid(page); - if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) + if (!queue_pages_required(page, qp)) continue; - if (PageTransCompound(page)) { + if (PageTransCompound(page) && !thp_migration_supported()) { get_page(page); pte_unmap_unlock(pte, ptl); lock_page(page); @@ -497,7 +540,6 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, #ifdef CONFIG_HUGETLB_PAGE struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; - int nid; struct page *page; spinlock_t *ptl; pte_t entry; @@ -507,8 +549,7 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, if (!pte_present(entry)) goto unlock; page = pte_page(entry); - nid = page_to_nid(page); - if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) + if (!queue_pages_required(page, qp)) goto unlock; /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ if (flags & (MPOL_MF_MOVE_ALL) || @@ -881,19 +922,21 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, #ifdef CONFIG_MIGRATION /* - * page migration + * page migration, thp tail pages can be passed. */ static void migrate_page_add(struct page *page, struct list_head *pagelist, unsigned long flags) { + struct page *head = compound_head(page); /* * Avoid migrating a page that is shared with others. */ - if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { - if (!isolate_lru_page(page)) { - list_add_tail(&page->lru, pagelist); - inc_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) { + if (!isolate_lru_page(head)) { + list_add_tail(&head->lru, pagelist); + mod_node_page_state(page_pgdat(head), + NR_ISOLATED_ANON + page_is_file_cache(head), + hpage_nr_pages(head)); } } } @@ -903,7 +946,17 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x if (PageHuge(page)) return alloc_huge_page_node(page_hstate(compound_head(page)), node); - else + else if (thp_migration_supported() && PageTransHuge(page)) { + struct page *thp; + + thp = alloc_pages_node(node, + (GFP_TRANSHUGE | __GFP_THISNODE), + HPAGE_PMD_ORDER); + if (!thp) + return NULL; + prep_transhuge_page(thp); + return thp; + } else return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0); } @@ -1069,6 +1122,15 @@ static struct page *new_page(struct page *page, unsigned long start, int **x) if (PageHuge(page)) { BUG_ON(!vma); return alloc_huge_page_noerr(vma, address, 1); + } else if (thp_migration_supported() && PageTransHuge(page)) { + struct page *thp; + + thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address, + HPAGE_PMD_ORDER); + if (!thp) + return NULL; + prep_transhuge_page(thp); + return thp; } /* * if !vma, alloc_page_vma() will use task or system default policy @@ -1683,8 +1745,7 @@ unsigned int mempolicy_slab_node(void) * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the * number of present nodes. */ -static unsigned offset_il_node(struct mempolicy *pol, - struct vm_area_struct *vma, unsigned long n) +static unsigned offset_il_node(struct mempolicy *pol, unsigned long n) { unsigned nnodes = nodes_weight(pol->v.nodes); unsigned target; @@ -1717,7 +1778,7 @@ static inline unsigned interleave_nid(struct mempolicy *pol, BUG_ON(shift < PAGE_SHIFT); off = vma->vm_pgoff >> (shift - PAGE_SHIFT); off += (addr - vma->vm_start) >> shift; - return offset_il_node(pol, vma, off); + return offset_il_node(pol, off); } else return interleave_nodes(pol); } @@ -2172,20 +2233,15 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long int polnid = -1; int ret = -1; - BUG_ON(!vma); - pol = get_vma_policy(vma, addr); if (!(pol->flags & MPOL_F_MOF)) goto out; switch (pol->mode) { case MPOL_INTERLEAVE: - BUG_ON(addr >= vma->vm_end); - BUG_ON(addr < vma->vm_start); - pgoff = vma->vm_pgoff; pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; - polnid = offset_il_node(pol, vma, pgoff); + polnid = offset_il_node(pol, pgoff); break; case MPOL_PREFERRED: diff --git a/mm/migrate.c b/mm/migrate.c index e84eeb4e4356..6954c1435833 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -36,6 +36,9 @@ #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> #include <linux/gfp.h> +#include <linux/pfn_t.h> +#include <linux/memremap.h> +#include <linux/userfaultfd_k.h> #include <linux/balloon_compaction.h> #include <linux/mmu_notifier.h> #include <linux/page_idle.h> @@ -185,8 +188,8 @@ void putback_movable_pages(struct list_head *l) unlock_page(page); put_page(page); } else { - dec_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + + page_is_file_cache(page), -hpage_nr_pages(page)); putback_lru_page(page); } } @@ -216,6 +219,15 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, new = page - pvmw.page->index + linear_page_index(vma, pvmw.address); +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + /* PMD-mapped THP migration entry */ + if (!pvmw.pte) { + VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page); + remove_migration_pmd(&pvmw, new); + continue; + } +#endif + get_page(new); pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot))); if (pte_swp_soft_dirty(*pvmw.pte)) @@ -228,7 +240,17 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (is_write_migration_entry(entry)) pte = maybe_mkwrite(pte, vma); - flush_dcache_page(new); + if (unlikely(is_zone_device_page(new))) { + if (is_device_private_page(new)) { + entry = make_device_private_entry(new, pte_write(pte)); + pte = swp_entry_to_pte(entry); + } else if (is_device_public_page(new)) { + pte = pte_mkdevmap(pte); + flush_dcache_page(new); + } + } else + flush_dcache_page(new); + #ifdef CONFIG_HUGETLB_PAGE if (PageHuge(new)) { pte = pte_mkhuge(pte); @@ -330,6 +352,27 @@ void migration_entry_wait_huge(struct vm_area_struct *vma, __migration_entry_wait(mm, pte, ptl); } +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) +{ + spinlock_t *ptl; + struct page *page; + + ptl = pmd_lock(mm, pmd); + if (!is_pmd_migration_entry(*pmd)) + goto unlock; + page = migration_entry_to_page(pmd_to_swp_entry(*pmd)); + if (!get_page_unless_zero(page)) + goto unlock; + spin_unlock(ptl); + wait_on_page_locked(page); + put_page(page); + return; +unlock: + spin_unlock(ptl); +} +#endif + #ifdef CONFIG_BLOCK /* Returns true if all buffers are successfully locked */ static bool buffer_migrate_lock_buffers(struct buffer_head *head, @@ -398,6 +441,13 @@ int migrate_page_move_mapping(struct address_space *mapping, int expected_count = 1 + extra_count; void **pslot; + /* + * Device public or private pages have an extra refcount as they are + * ZONE_DEVICE pages. + */ + expected_count += is_device_private_page(page); + expected_count += is_device_public_page(page); + if (!mapping) { /* Anonymous page without mapping */ if (page_count(page) != expected_count) @@ -604,15 +654,10 @@ static void copy_huge_page(struct page *dst, struct page *src) /* * Copy the page to its new location */ -void migrate_page_copy(struct page *newpage, struct page *page) +void migrate_page_states(struct page *newpage, struct page *page) { int cpupid; - if (PageHuge(page) || PageTransHuge(page)) - copy_huge_page(newpage, page); - else - copy_highpage(newpage, page); - if (PageError(page)) SetPageError(newpage); if (PageReferenced(page)) @@ -666,6 +711,17 @@ void migrate_page_copy(struct page *newpage, struct page *page) mem_cgroup_migrate(page, newpage); } +EXPORT_SYMBOL(migrate_page_states); + +void migrate_page_copy(struct page *newpage, struct page *page) +{ + if (PageHuge(page) || PageTransHuge(page)) + copy_huge_page(newpage, page); + else + copy_highpage(newpage, page); + + migrate_page_states(newpage, page); +} EXPORT_SYMBOL(migrate_page_copy); /************************************************************ @@ -691,7 +747,10 @@ int migrate_page(struct address_space *mapping, if (rc != MIGRATEPAGE_SUCCESS) return rc; - migrate_page_copy(newpage, page); + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); + else + migrate_page_states(newpage, page); return MIGRATEPAGE_SUCCESS; } EXPORT_SYMBOL(migrate_page); @@ -741,12 +800,15 @@ int buffer_migrate_page(struct address_space *mapping, SetPagePrivate(newpage); - migrate_page_copy(newpage, page); + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); + else + migrate_page_states(newpage, page); bh = head; do { unlock_buffer(bh); - put_bh(bh); + put_bh(bh); bh = bh->b_this_page; } while (bh != head); @@ -805,8 +867,13 @@ static int fallback_migrate_page(struct address_space *mapping, { if (PageDirty(page)) { /* Only writeback pages in full synchronous migration */ - if (mode != MIGRATE_SYNC) + switch (mode) { + case MIGRATE_SYNC: + case MIGRATE_SYNC_NO_COPY: + break; + default: return -EBUSY; + } return writeout(mapping, page); } @@ -943,7 +1010,11 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * the retry loop is too short and in the sync-light case, * the overhead of stalling is too much */ - if (mode != MIGRATE_SYNC) { + switch (mode) { + case MIGRATE_SYNC: + case MIGRATE_SYNC_NO_COPY: + break; + default: rc = -EBUSY; goto out_unlock; } @@ -1088,7 +1159,7 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, goto out; } - if (unlikely(PageTransHuge(page))) { + if (unlikely(PageTransHuge(page) && !PageTransHuge(newpage))) { lock_page(page); rc = split_huge_page(page); unlock_page(page); @@ -1116,8 +1187,8 @@ out: * as __PageMovable */ if (likely(!__PageMovable(page))) - dec_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + + page_is_file_cache(page), -hpage_nr_pages(page)); } /* @@ -1213,8 +1284,15 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, return -ENOMEM; if (!trylock_page(hpage)) { - if (!force || mode != MIGRATE_SYNC) + if (!force) + goto out; + switch (mode) { + case MIGRATE_SYNC: + case MIGRATE_SYNC_NO_COPY: + break; + default: goto out; + } lock_page(hpage); } @@ -1391,7 +1469,17 @@ static struct page *new_page_node(struct page *p, unsigned long private, if (PageHuge(p)) return alloc_huge_page_node(page_hstate(compound_head(p)), pm->node); - else + else if (thp_migration_supported() && PageTransHuge(p)) { + struct page *thp; + + thp = alloc_pages_node(pm->node, + (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM, + HPAGE_PMD_ORDER); + if (!thp) + return NULL; + prep_transhuge_page(thp); + return thp; + } else return __alloc_pages_node(pm->node, GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0); } @@ -1418,6 +1506,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm, for (pp = pm; pp->node != MAX_NUMNODES; pp++) { struct vm_area_struct *vma; struct page *page; + struct page *head; + unsigned int follflags; err = -EFAULT; vma = find_vma(mm, pp->addr); @@ -1425,8 +1515,10 @@ static int do_move_page_to_node_array(struct mm_struct *mm, goto set_status; /* FOLL_DUMP to ignore special (like zero) pages */ - page = follow_page(vma, pp->addr, - FOLL_GET | FOLL_SPLIT | FOLL_DUMP); + follflags = FOLL_GET | FOLL_DUMP; + if (!thp_migration_supported()) + follflags |= FOLL_SPLIT; + page = follow_page(vma, pp->addr, follflags); err = PTR_ERR(page); if (IS_ERR(page)) @@ -1436,7 +1528,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm, if (!page) goto set_status; - pp->page = page; err = page_to_nid(page); if (err == pp->node) @@ -1451,16 +1542,22 @@ static int do_move_page_to_node_array(struct mm_struct *mm, goto put_and_set; if (PageHuge(page)) { - if (PageHead(page)) + if (PageHead(page)) { isolate_huge_page(page, &pagelist); + err = 0; + pp->page = page; + } goto put_and_set; } - err = isolate_lru_page(page); + pp->page = compound_head(page); + head = compound_head(page); + err = isolate_lru_page(head); if (!err) { - list_add_tail(&page->lru, &pagelist); - inc_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + list_add_tail(&head->lru, &pagelist); + mod_node_page_state(page_pgdat(head), + NR_ISOLATED_ANON + page_is_file_cache(head), + hpage_nr_pages(head)); } put_and_set: /* @@ -2029,3 +2126,859 @@ out_unlock: #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_NUMA */ + +#if defined(CONFIG_MIGRATE_VMA_HELPER) +struct migrate_vma { + struct vm_area_struct *vma; + unsigned long *dst; + unsigned long *src; + unsigned long cpages; + unsigned long npages; + unsigned long start; + unsigned long end; +}; + +static int migrate_vma_collect_hole(unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + unsigned long addr; + + for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { + migrate->src[migrate->npages++] = MIGRATE_PFN_MIGRATE; + migrate->dst[migrate->npages] = 0; + migrate->cpages++; + } + + return 0; +} + +static int migrate_vma_collect_skip(unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + unsigned long addr; + + for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { + migrate->dst[migrate->npages] = 0; + migrate->src[migrate->npages++] = 0; + } + + return 0; +} + +static int migrate_vma_collect_pmd(pmd_t *pmdp, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + struct vm_area_struct *vma = walk->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long addr = start, unmapped = 0; + spinlock_t *ptl; + pte_t *ptep; + +again: + if (pmd_none(*pmdp)) + return migrate_vma_collect_hole(start, end, walk); + + if (pmd_trans_huge(*pmdp)) { + struct page *page; + + ptl = pmd_lock(mm, pmdp); + if (unlikely(!pmd_trans_huge(*pmdp))) { + spin_unlock(ptl); + goto again; + } + + page = pmd_page(*pmdp); + if (is_huge_zero_page(page)) { + spin_unlock(ptl); + split_huge_pmd(vma, pmdp, addr); + if (pmd_trans_unstable(pmdp)) + return migrate_vma_collect_skip(start, end, + walk); + } else { + int ret; + + get_page(page); + spin_unlock(ptl); + if (unlikely(!trylock_page(page))) + return migrate_vma_collect_skip(start, end, + walk); + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + if (ret) + return migrate_vma_collect_skip(start, end, + walk); + if (pmd_none(*pmdp)) + return migrate_vma_collect_hole(start, end, + walk); + } + } + + if (unlikely(pmd_bad(*pmdp))) + return migrate_vma_collect_skip(start, end, walk); + + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + arch_enter_lazy_mmu_mode(); + + for (; addr < end; addr += PAGE_SIZE, ptep++) { + unsigned long mpfn, pfn; + struct page *page; + swp_entry_t entry; + pte_t pte; + + pte = *ptep; + pfn = pte_pfn(pte); + + if (pte_none(pte)) { + mpfn = MIGRATE_PFN_MIGRATE; + migrate->cpages++; + pfn = 0; + goto next; + } + + if (!pte_present(pte)) { + mpfn = pfn = 0; + + /* + * Only care about unaddressable device page special + * page table entry. Other special swap entries are not + * migratable, and we ignore regular swapped page. + */ + entry = pte_to_swp_entry(pte); + if (!is_device_private_entry(entry)) + goto next; + + page = device_private_entry_to_page(entry); + mpfn = migrate_pfn(page_to_pfn(page))| + MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE; + if (is_write_device_private_entry(entry)) + mpfn |= MIGRATE_PFN_WRITE; + } else { + if (is_zero_pfn(pfn)) { + mpfn = MIGRATE_PFN_MIGRATE; + migrate->cpages++; + pfn = 0; + goto next; + } + page = _vm_normal_page(migrate->vma, addr, pte, true); + mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; + mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; + } + + /* FIXME support THP */ + if (!page || !page->mapping || PageTransCompound(page)) { + mpfn = pfn = 0; + goto next; + } + pfn = page_to_pfn(page); + + /* + * By getting a reference on the page we pin it and that blocks + * any kind of migration. Side effect is that it "freezes" the + * pte. + * + * We drop this reference after isolating the page from the lru + * for non device page (device page are not on the lru and thus + * can't be dropped from it). + */ + get_page(page); + migrate->cpages++; + + /* + * Optimize for the common case where page is only mapped once + * in one process. If we can lock the page, then we can safely + * set up a special migration page table entry now. + */ + if (trylock_page(page)) { + pte_t swp_pte; + + mpfn |= MIGRATE_PFN_LOCKED; + ptep_get_and_clear(mm, addr, ptep); + + /* Setup special migration page table entry */ + entry = make_migration_entry(page, pte_write(pte)); + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pte)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + set_pte_at(mm, addr, ptep, swp_pte); + + /* + * This is like regular unmap: we remove the rmap and + * drop page refcount. Page won't be freed, as we took + * a reference just above. + */ + page_remove_rmap(page, false); + put_page(page); + + if (pte_present(pte)) + unmapped++; + } + +next: + migrate->dst[migrate->npages] = 0; + migrate->src[migrate->npages++] = mpfn; + } + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(ptep - 1, ptl); + + /* Only flush the TLB if we actually modified any entries */ + if (unmapped) + flush_tlb_range(walk->vma, start, end); + + return 0; +} + +/* + * migrate_vma_collect() - collect pages over a range of virtual addresses + * @migrate: migrate struct containing all migration information + * + * This will walk the CPU page table. For each virtual address backed by a + * valid page, it updates the src array and takes a reference on the page, in + * order to pin the page until we lock it and unmap it. + */ +static void migrate_vma_collect(struct migrate_vma *migrate) +{ + struct mm_walk mm_walk; + + mm_walk.pmd_entry = migrate_vma_collect_pmd; + mm_walk.pte_entry = NULL; + mm_walk.pte_hole = migrate_vma_collect_hole; + mm_walk.hugetlb_entry = NULL; + mm_walk.test_walk = NULL; + mm_walk.vma = migrate->vma; + mm_walk.mm = migrate->vma->vm_mm; + mm_walk.private = migrate; + + mmu_notifier_invalidate_range_start(mm_walk.mm, + migrate->start, + migrate->end); + walk_page_range(migrate->start, migrate->end, &mm_walk); + mmu_notifier_invalidate_range_end(mm_walk.mm, + migrate->start, + migrate->end); + + migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); +} + +/* + * migrate_vma_check_page() - check if page is pinned or not + * @page: struct page to check + * + * Pinned pages cannot be migrated. This is the same test as in + * migrate_page_move_mapping(), except that here we allow migration of a + * ZONE_DEVICE page. + */ +static bool migrate_vma_check_page(struct page *page) +{ + /* + * One extra ref because caller holds an extra reference, either from + * isolate_lru_page() for a regular page, or migrate_vma_collect() for + * a device page. + */ + int extra = 1; + + /* + * FIXME support THP (transparent huge page), it is bit more complex to + * check them than regular pages, because they can be mapped with a pmd + * or with a pte (split pte mapping). + */ + if (PageCompound(page)) + return false; + + /* Page from ZONE_DEVICE have one extra reference */ + if (is_zone_device_page(page)) { + /* + * Private page can never be pin as they have no valid pte and + * GUP will fail for those. Yet if there is a pending migration + * a thread might try to wait on the pte migration entry and + * will bump the page reference count. Sadly there is no way to + * differentiate a regular pin from migration wait. Hence to + * avoid 2 racing thread trying to migrate back to CPU to enter + * infinite loop (one stoping migration because the other is + * waiting on pte migration entry). We always return true here. + * + * FIXME proper solution is to rework migration_entry_wait() so + * it does not need to take a reference on page. + */ + if (is_device_private_page(page)) + return true; + + /* + * Only allow device public page to be migrated and account for + * the extra reference count imply by ZONE_DEVICE pages. + */ + if (!is_device_public_page(page)) + return false; + extra++; + } + + /* For file back page */ + if (page_mapping(page)) + extra += 1 + page_has_private(page); + + if ((page_count(page) - extra) > page_mapcount(page)) + return false; + + return true; +} + +/* + * migrate_vma_prepare() - lock pages and isolate them from the lru + * @migrate: migrate struct containing all migration information + * + * This locks pages that have been collected by migrate_vma_collect(). Once each + * page is locked it is isolated from the lru (for non-device pages). Finally, + * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be + * migrated by concurrent kernel threads. + */ +static void migrate_vma_prepare(struct migrate_vma *migrate) +{ + const unsigned long npages = migrate->npages; + const unsigned long start = migrate->start; + unsigned long addr, i, restore = 0; + bool allow_drain = true; + + lru_add_drain(); + + for (i = 0; (i < npages) && migrate->cpages; i++) { + struct page *page = migrate_pfn_to_page(migrate->src[i]); + bool remap = true; + + if (!page) + continue; + + if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) { + /* + * Because we are migrating several pages there can be + * a deadlock between 2 concurrent migration where each + * are waiting on each other page lock. + * + * Make migrate_vma() a best effort thing and backoff + * for any page we can not lock right away. + */ + if (!trylock_page(page)) { + migrate->src[i] = 0; + migrate->cpages--; + put_page(page); + continue; + } + remap = false; + migrate->src[i] |= MIGRATE_PFN_LOCKED; + } + + /* ZONE_DEVICE pages are not on LRU */ + if (!is_zone_device_page(page)) { + if (!PageLRU(page) && allow_drain) { + /* Drain CPU's pagevec */ + lru_add_drain_all(); + allow_drain = false; + } + + if (isolate_lru_page(page)) { + if (remap) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + migrate->cpages--; + restore++; + } else { + migrate->src[i] = 0; + unlock_page(page); + migrate->cpages--; + put_page(page); + } + continue; + } + + /* Drop the reference we took in collect */ + put_page(page); + } + + if (!migrate_vma_check_page(page)) { + if (remap) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + migrate->cpages--; + restore++; + + if (!is_zone_device_page(page)) { + get_page(page); + putback_lru_page(page); + } + } else { + migrate->src[i] = 0; + unlock_page(page); + migrate->cpages--; + + if (!is_zone_device_page(page)) + putback_lru_page(page); + else + put_page(page); + } + } + } + + for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) { + struct page *page = migrate_pfn_to_page(migrate->src[i]); + + if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) + continue; + + remove_migration_pte(page, migrate->vma, addr, page); + + migrate->src[i] = 0; + unlock_page(page); + put_page(page); + restore--; + } +} + +/* + * migrate_vma_unmap() - replace page mapping with special migration pte entry + * @migrate: migrate struct containing all migration information + * + * Replace page mapping (CPU page table pte) with a special migration pte entry + * and check again if it has been pinned. Pinned pages are restored because we + * cannot migrate them. + * + * This is the last step before we call the device driver callback to allocate + * destination memory and copy contents of original page over to new page. + */ +static void migrate_vma_unmap(struct migrate_vma *migrate) +{ + int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; + const unsigned long npages = migrate->npages; + const unsigned long start = migrate->start; + unsigned long addr, i, restore = 0; + + for (i = 0; i < npages; i++) { + struct page *page = migrate_pfn_to_page(migrate->src[i]); + + if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE)) + continue; + + if (page_mapped(page)) { + try_to_unmap(page, flags); + if (page_mapped(page)) + goto restore; + } + + if (migrate_vma_check_page(page)) + continue; + +restore: + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + migrate->cpages--; + restore++; + } + + for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) { + struct page *page = migrate_pfn_to_page(migrate->src[i]); + + if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) + continue; + + remove_migration_ptes(page, page, false); + + migrate->src[i] = 0; + unlock_page(page); + restore--; + + if (is_zone_device_page(page)) + put_page(page); + else + putback_lru_page(page); + } +} + +static void migrate_vma_insert_page(struct migrate_vma *migrate, + unsigned long addr, + struct page *page, + unsigned long *src, + unsigned long *dst) +{ + struct vm_area_struct *vma = migrate->vma; + struct mm_struct *mm = vma->vm_mm; + struct mem_cgroup *memcg; + bool flush = false; + spinlock_t *ptl; + pte_t entry; + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + /* Only allow populating anonymous memory */ + if (!vma_is_anonymous(vma)) + goto abort; + + pgdp = pgd_offset(mm, addr); + p4dp = p4d_alloc(mm, pgdp, addr); + if (!p4dp) + goto abort; + pudp = pud_alloc(mm, p4dp, addr); + if (!pudp) + goto abort; + pmdp = pmd_alloc(mm, pudp, addr); + if (!pmdp) + goto abort; + + if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)) + goto abort; + + /* + * Use pte_alloc() instead of pte_alloc_map(). We can't run + * pte_offset_map() on pmds where a huge pmd might be created + * from a different thread. + * + * pte_alloc_map() is safe to use under down_write(mmap_sem) or when + * parallel threads are excluded by other means. + * + * Here we only have down_read(mmap_sem). + */ + if (pte_alloc(mm, pmdp, addr)) + goto abort; + + /* See the comment in pte_alloc_one_map() */ + if (unlikely(pmd_trans_unstable(pmdp))) + goto abort; + + if (unlikely(anon_vma_prepare(vma))) + goto abort; + if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false)) + goto abort; + + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceding stores to the page contents become visible before + * the set_pte_at() write. + */ + __SetPageUptodate(page); + + if (is_zone_device_page(page)) { + if (is_device_private_page(page)) { + swp_entry_t swp_entry; + + swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); + entry = swp_entry_to_pte(swp_entry); + } else if (is_device_public_page(page)) { + entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry)); + entry = pte_mkdevmap(entry); + } + } else { + entry = mk_pte(page, vma->vm_page_prot); + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry)); + } + + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + + if (pte_present(*ptep)) { + unsigned long pfn = pte_pfn(*ptep); + + if (!is_zero_pfn(pfn)) { + pte_unmap_unlock(ptep, ptl); + mem_cgroup_cancel_charge(page, memcg, false); + goto abort; + } + flush = true; + } else if (!pte_none(*ptep)) { + pte_unmap_unlock(ptep, ptl); + mem_cgroup_cancel_charge(page, memcg, false); + goto abort; + } + + /* + * Check for usefaultfd but do not deliver the fault. Instead, + * just back off. + */ + if (userfaultfd_missing(vma)) { + pte_unmap_unlock(ptep, ptl); + mem_cgroup_cancel_charge(page, memcg, false); + goto abort; + } + + inc_mm_counter(mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, addr, false); + mem_cgroup_commit_charge(page, memcg, false, false); + if (!is_zone_device_page(page)) + lru_cache_add_active_or_unevictable(page, vma); + get_page(page); + + if (flush) { + flush_cache_page(vma, addr, pte_pfn(*ptep)); + ptep_clear_flush_notify(vma, addr, ptep); + set_pte_at_notify(mm, addr, ptep, entry); + update_mmu_cache(vma, addr, ptep); + } else { + /* No need to invalidate - it was non-present before */ + set_pte_at(mm, addr, ptep, entry); + update_mmu_cache(vma, addr, ptep); + } + + pte_unmap_unlock(ptep, ptl); + *src = MIGRATE_PFN_MIGRATE; + return; + +abort: + *src &= ~MIGRATE_PFN_MIGRATE; +} + +/* + * migrate_vma_pages() - migrate meta-data from src page to dst page + * @migrate: migrate struct containing all migration information + * + * This migrates struct page meta-data from source struct page to destination + * struct page. This effectively finishes the migration from source page to the + * destination page. + */ +static void migrate_vma_pages(struct migrate_vma *migrate) +{ + const unsigned long npages = migrate->npages; + const unsigned long start = migrate->start; + struct vm_area_struct *vma = migrate->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long addr, i, mmu_start; + bool notified = false; + + for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { + struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); + struct page *page = migrate_pfn_to_page(migrate->src[i]); + struct address_space *mapping; + int r; + + if (!newpage) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } + + if (!page) { + if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) { + continue; + } + if (!notified) { + mmu_start = addr; + notified = true; + mmu_notifier_invalidate_range_start(mm, + mmu_start, + migrate->end); + } + migrate_vma_insert_page(migrate, addr, newpage, + &migrate->src[i], + &migrate->dst[i]); + continue; + } + + mapping = page_mapping(page); + + if (is_zone_device_page(newpage)) { + if (is_device_private_page(newpage)) { + /* + * For now only support private anonymous when + * migrating to un-addressable device memory. + */ + if (mapping) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } + } else if (!is_device_public_page(newpage)) { + /* + * Other types of ZONE_DEVICE page are not + * supported. + */ + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } + } + + r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY); + if (r != MIGRATEPAGE_SUCCESS) + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + } + + if (notified) + mmu_notifier_invalidate_range_end(mm, mmu_start, + migrate->end); +} + +/* + * migrate_vma_finalize() - restore CPU page table entry + * @migrate: migrate struct containing all migration information + * + * This replaces the special migration pte entry with either a mapping to the + * new page if migration was successful for that page, or to the original page + * otherwise. + * + * This also unlocks the pages and puts them back on the lru, or drops the extra + * refcount, for device pages. + */ +static void migrate_vma_finalize(struct migrate_vma *migrate) +{ + const unsigned long npages = migrate->npages; + unsigned long i; + + for (i = 0; i < npages; i++) { + struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); + struct page *page = migrate_pfn_to_page(migrate->src[i]); + + if (!page) { + if (newpage) { + unlock_page(newpage); + put_page(newpage); + } + continue; + } + + if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) { + if (newpage) { + unlock_page(newpage); + put_page(newpage); + } + newpage = page; + } + + remove_migration_ptes(page, newpage, false); + unlock_page(page); + migrate->cpages--; + + if (is_zone_device_page(page)) + put_page(page); + else + putback_lru_page(page); + + if (newpage != page) { + unlock_page(newpage); + if (is_zone_device_page(newpage)) + put_page(newpage); + else + putback_lru_page(newpage); + } + } +} + +/* + * migrate_vma() - migrate a range of memory inside vma + * + * @ops: migration callback for allocating destination memory and copying + * @vma: virtual memory area containing the range to be migrated + * @start: start address of the range to migrate (inclusive) + * @end: end address of the range to migrate (exclusive) + * @src: array of hmm_pfn_t containing source pfns + * @dst: array of hmm_pfn_t containing destination pfns + * @private: pointer passed back to each of the callback + * Returns: 0 on success, error code otherwise + * + * This function tries to migrate a range of memory virtual address range, using + * callbacks to allocate and copy memory from source to destination. First it + * collects all the pages backing each virtual address in the range, saving this + * inside the src array. Then it locks those pages and unmaps them. Once the pages + * are locked and unmapped, it checks whether each page is pinned or not. Pages + * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) + * in the corresponding src array entry. It then restores any pages that are + * pinned, by remapping and unlocking those pages. + * + * At this point it calls the alloc_and_copy() callback. For documentation on + * what is expected from that callback, see struct migrate_vma_ops comments in + * include/linux/migrate.h + * + * After the alloc_and_copy() callback, this function goes over each entry in + * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag + * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set, + * then the function tries to migrate struct page information from the source + * struct page to the destination struct page. If it fails to migrate the struct + * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src + * array. + * + * At this point all successfully migrated pages have an entry in the src + * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst + * array entry with MIGRATE_PFN_VALID flag set. + * + * It then calls the finalize_and_map() callback. See comments for "struct + * migrate_vma_ops", in include/linux/migrate.h for details about + * finalize_and_map() behavior. + * + * After the finalize_and_map() callback, for successfully migrated pages, this + * function updates the CPU page table to point to new pages, otherwise it + * restores the CPU page table to point to the original source pages. + * + * Function returns 0 after the above steps, even if no pages were migrated + * (The function only returns an error if any of the arguments are invalid.) + * + * Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT + * unsigned long entries. + */ +int migrate_vma(const struct migrate_vma_ops *ops, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + unsigned long *src, + unsigned long *dst, + void *private) +{ + struct migrate_vma migrate; + + /* Sanity check the arguments */ + start &= PAGE_MASK; + end &= PAGE_MASK; + if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) + return -EINVAL; + if (start < vma->vm_start || start >= vma->vm_end) + return -EINVAL; + if (end <= vma->vm_start || end > vma->vm_end) + return -EINVAL; + if (!ops || !src || !dst || start >= end) + return -EINVAL; + + memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT)); + migrate.src = src; + migrate.dst = dst; + migrate.start = start; + migrate.npages = 0; + migrate.cpages = 0; + migrate.end = end; + migrate.vma = vma; + + /* Collect, and try to unmap source pages */ + migrate_vma_collect(&migrate); + if (!migrate.cpages) + return 0; + + /* Lock and isolate page */ + migrate_vma_prepare(&migrate); + if (!migrate.cpages) + return 0; + + /* Unmap pages */ + migrate_vma_unmap(&migrate); + if (!migrate.cpages) + return 0; + + /* + * At this point pages are locked and unmapped, and thus they have + * stable content and can safely be copied to destination memory that + * is allocated by the callback. + * + * Note that migration can fail in migrate_vma_struct_page() for each + * individual page. + */ + ops->alloc_and_copy(vma, src, dst, start, end, private); + + /* This does the real migration of struct page */ + migrate_vma_pages(&migrate); + + ops->finalize_and_map(vma, src, dst, start, end, private); + + /* Unlock and remap pages */ + migrate_vma_finalize(&migrate); + + return 0; +} +EXPORT_SYMBOL(migrate_vma); +#endif /* defined(MIGRATE_VMA_HELPER) */ diff --git a/mm/mlock.c b/mm/mlock.c index b562b5523a65..dfc6f1912176 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -365,8 +365,8 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) * @start + PAGE_SIZE when no page could be added by the pte walk. */ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, - struct vm_area_struct *vma, int zoneid, unsigned long start, - unsigned long end) + struct vm_area_struct *vma, struct zone *zone, + unsigned long start, unsigned long end) { pte_t *pte; spinlock_t *ptl; @@ -394,7 +394,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, * Break if page could not be obtained or the page's node+zone does not * match */ - if (!page || page_zone_id(page) != zoneid) + if (!page || page_zone(page) != zone) break; /* @@ -446,7 +446,6 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long page_increm; struct pagevec pvec; struct zone *zone; - int zoneid; pagevec_init(&pvec, 0); /* @@ -481,7 +480,6 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, */ pagevec_add(&pvec, page); zone = page_zone(page); - zoneid = page_zone_id(page); /* * Try to fill the rest of pagevec using fast @@ -490,7 +488,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, * pagevec. */ start = __munlock_pagevec_fill(&pvec, vma, - zoneid, start, end); + zone, start, end); __munlock_pagevec(&pvec, zone); goto next; } diff --git a/mm/mmap.c b/mm/mmap.c index f19efcf75418..680506faceae 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -44,6 +44,7 @@ #include <linux/userfaultfd_k.h> #include <linux/moduleparam.h> #include <linux/pkeys.h> +#include <linux/oom.h> #include <linux/uaccess.h> #include <asm/cacheflush.h> @@ -684,7 +685,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *next = vma->vm_next, *orig_vma = vma; struct address_space *mapping = NULL; - struct rb_root *root = NULL; + struct rb_root_cached *root = NULL; struct anon_vma *anon_vma = NULL; struct file *file = vma->vm_file; bool start_changed = false, end_changed = false; @@ -2639,13 +2640,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, if (vma->vm_start >= end) return 0; - if (uf) { - int error = userfaultfd_unmap_prep(vma, start, end, uf); - - if (error) - return error; - } - /* * If we need to split any vma, do it now to save pain later. * @@ -2679,6 +2673,21 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, } vma = prev ? prev->vm_next : mm->mmap; + if (unlikely(uf)) { + /* + * If userfaultfd_unmap_prep returns an error the vmas + * will remain splitted, but userland will get a + * highly unexpected error anyway. This is no + * different than the case where the first of the two + * __split_vma fails, but we don't undo the first + * split, despite we could. This is unlikely enough + * failure that it's not worth optimizing it for. + */ + int error = userfaultfd_unmap_prep(vma, start, end, uf); + if (error) + return error; + } + /* * unlock any mlock()ed ranges before detaching vmas */ @@ -2993,6 +3002,23 @@ void exit_mmap(struct mm_struct *mm) /* Use -1 here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, vma, 0, -1); + set_bit(MMF_OOM_SKIP, &mm->flags); + if (unlikely(tsk_is_oom_victim(current))) { + /* + * Wait for oom_reap_task() to stop working on this + * mm. Because MMF_OOM_SKIP is already set before + * calling down_read(), oom_reap_task() will not run + * on this "mm" post up_write(). + * + * tsk_is_oom_victim() cannot be set from under us + * either because current->mm is already set to NULL + * under task_lock before calling mmput and oom_mm is + * set not NULL by the OOM killer only if current->mm + * is found not NULL while holding the task_lock. + */ + down_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); + } free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb, 0, -1); @@ -3314,7 +3340,7 @@ static DEFINE_MUTEX(mm_all_locks_mutex); static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) { - if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { + if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { /* * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. @@ -3330,7 +3356,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) * anon_vma->root->rwsem. */ if (__test_and_set_bit(0, (unsigned long *) - &anon_vma->root->rb_root.rb_node)) + &anon_vma->root->rb_root.rb_root.rb_node)) BUG(); } } @@ -3432,7 +3458,7 @@ out_unlock: static void vm_unlock_anon_vma(struct anon_vma *anon_vma) { - if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { + if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { /* * The LSB of head.next can't change to 0 from under * us because we hold the mm_all_locks_mutex. @@ -3446,7 +3472,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) * anon_vma->root->rwsem. */ if (!__test_and_clear_bit(0, (unsigned long *) - &anon_vma->root->rb_root.rb_node)) + &anon_vma->root->rb_root.rb_root.rb_node)) BUG(); anon_vma_unlock_write(anon_vma); } @@ -3514,7 +3540,7 @@ static int init_user_reserve(void) { unsigned long free_kbytes; - free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); return 0; @@ -3535,7 +3561,7 @@ static int init_admin_reserve(void) { unsigned long free_kbytes; - free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); return 0; @@ -3579,7 +3605,7 @@ static int reserve_mem_notifier(struct notifier_block *nb, break; case MEM_OFFLINE: - free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); if (sysctl_user_reserve_kbytes > free_kbytes) { init_user_reserve(); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 54ca54562928..314285284e6e 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -174,20 +174,6 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, srcu_read_unlock(&srcu, id); } -void __mmu_notifier_invalidate_page(struct mm_struct *mm, - unsigned long address) -{ - struct mmu_notifier *mn; - int id; - - id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { - if (mn->ops->invalidate_page) - mn->ops->invalidate_page(mn, mm, address); - } - srcu_read_unlock(&srcu, id); -} - void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, unsigned long start, unsigned long end) { diff --git a/mm/mprotect.c b/mm/mprotect.c index bd0f409922cb..6d3e2f082290 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -125,6 +125,20 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, pages++; } + + if (is_write_device_private_entry(entry)) { + pte_t newpte; + + /* + * We do not preserve soft-dirtiness. See + * copy_one_pte() for explanation. + */ + make_device_private_entry_read(&entry); + newpte = swp_entry_to_pte(entry); + set_pte_at(mm, addr, pte, newpte); + + pages++; + } } } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); @@ -149,7 +163,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, unsigned long this_pages; next = pmd_addr_end(addr, end); - if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd) + if (!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd) && !pmd_devmap(*pmd) && pmd_none_or_clear_bad(pmd)) continue; @@ -159,7 +173,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(mm, mni_start, end); } - if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { + if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { __split_huge_pmd(vma, pmd, addr, false, NULL); } else { diff --git a/mm/mremap.c b/mm/mremap.c index 3f23715d3c69..cfec004c4ff9 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -223,7 +223,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); if (!new_pmd) break; - if (pmd_trans_huge(*old_pmd)) { + if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd)) { if (extent == HPAGE_PMD_SIZE) { bool moved; /* See comment in move_ptes() */ @@ -384,6 +384,19 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, if (!vma || vma->vm_start > addr) return ERR_PTR(-EFAULT); + /* + * !old_len is a special case where an attempt is made to 'duplicate' + * a mapping. This makes no sense for private mappings as it will + * instead create a fresh/new mapping unrelated to the original. This + * is contrary to the basic idea of mremap which creates new mappings + * based on the original. There are no known use cases for this + * behavior. As a result, fail such attempts. + */ + if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) { + pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", current->comm, current->pid); + return ERR_PTR(-EINVAL); + } + if (is_vm_hugetlb_page(vma)) return ERR_PTR(-EINVAL); diff --git a/mm/nommu.c b/mm/nommu.c index fc184f597d59..17c00d93de2e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1164,17 +1164,12 @@ static int do_mmap_private(struct vm_area_struct *vma, if (vma->vm_file) { /* read the contents of a file into the copy */ - mm_segment_t old_fs; loff_t fpos; fpos = vma->vm_pgoff; fpos <<= PAGE_SHIFT; - old_fs = get_fs(); - set_fs(KERNEL_DS); - ret = __vfs_read(vma->vm_file, base, len, &fpos); - set_fs(old_fs); - + ret = kernel_read(vma->vm_file, base, len, &fpos); if (ret < 0) goto error_free; @@ -1962,7 +1957,7 @@ static int __meminit init_user_reserve(void) { unsigned long free_kbytes; - free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); return 0; @@ -1983,7 +1978,7 @@ static int __meminit init_admin_reserve(void) { unsigned long free_kbytes; - free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); return 0; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 9e8b4f030c1c..99736e026712 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -495,11 +495,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) } /* - * increase mm_users only after we know we will reap something so - * that the mmput_async is called only when we have reaped something - * and delayed __mmput doesn't matter that much + * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't + * work on the mm anymore. The check for MMF_OOM_SKIP must run + * under mmap_sem for reading because it serializes against the + * down_write();up_write() cycle in exit_mmap(). */ - if (!mmget_not_zero(mm)) { + if (test_bit(MMF_OOM_SKIP, &mm->flags)) { up_read(&mm->mmap_sem); trace_skip_task_reaping(tsk->pid); goto unlock_oom; @@ -542,12 +543,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) K(get_mm_counter(mm, MM_SHMEMPAGES))); up_read(&mm->mmap_sem); - /* - * Drop our reference but make sure the mmput slow path is called from a - * different context because we shouldn't risk we get stuck there and - * put the oom_reaper out of the way. - */ - mmput_async(mm); trace_finish_task_reaping(tsk->pid); unlock_oom: mutex_unlock(&oom_lock); @@ -824,7 +819,8 @@ static void oom_kill_process(struct oom_control *oc, const char *message) /* * If the task is already exiting, don't alarm the sysadmin or kill - * its children or threads, just set TIF_MEMDIE so it can die quickly + * its children or threads, just give it access to memory reserves + * so it can die quickly */ task_lock(p); if (task_will_free_mem(p)) { @@ -889,9 +885,9 @@ static void oom_kill_process(struct oom_control *oc, const char *message) count_memcg_event_mm(mm, OOM_KILL); /* - * We should send SIGKILL before setting TIF_MEMDIE in order to prevent - * the OOM victim from depleting the memory reserves from the user - * space under its control. + * We should send SIGKILL before granting access to memory reserves + * in order to prevent the OOM victim from depleting the memory + * reserves from the user space under its control. */ do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); mark_oom_victim(victim); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index bf050ab025b7..0b9c5cbe8eba 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -363,7 +363,7 @@ static unsigned long global_dirtyable_memory(void) { unsigned long x; - x = global_page_state(NR_FREE_PAGES); + x = global_zone_page_state(NR_FREE_PAGES); /* * Pages reserved for the kernel should not be considered * dirtyable, to prevent a situation where reclaim has to @@ -1405,7 +1405,7 @@ void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time) * will look to see if it needs to start dirty throttling. * * If dirty_poll_interval is too low, big NUMA machines will call the expensive - * global_page_state() too often. So scale it near-sqrt to the safety margin + * global_zone_page_state() too often. So scale it near-sqrt to the safety margin * (the number of pages we may dirty without exceeding the dirty limits). */ static unsigned long dirty_poll_interval(unsigned long dirty, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7a58eb5757e3..c841af88836a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -66,6 +66,7 @@ #include <linux/kthread.h> #include <linux/memcontrol.h> #include <linux/ftrace.h> +#include <linux/lockdep.h> #include <linux/nmi.h> #include <asm/sections.h> @@ -2740,18 +2741,18 @@ int __isolate_free_page(struct page *page, unsigned int order) static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) { #ifdef CONFIG_NUMA - enum zone_stat_item local_stat = NUMA_LOCAL; + enum numa_stat_item local_stat = NUMA_LOCAL; if (z->node != numa_node_id()) local_stat = NUMA_OTHER; if (z->node == preferred_zone->node) - __inc_zone_state(z, NUMA_HIT); + __inc_numa_state(z, NUMA_HIT); else { - __inc_zone_state(z, NUMA_MISS); - __inc_zone_state(preferred_zone, NUMA_FOREIGN); + __inc_numa_state(z, NUMA_MISS); + __inc_numa_state(preferred_zone, NUMA_FOREIGN); } - __inc_zone_state(z, local_stat); + __inc_numa_state(z, local_stat); #endif } @@ -2950,7 +2951,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, { long min = mark; int o; - const bool alloc_harder = (alloc_flags & ALLOC_HARDER); + const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); /* free_pages may go negative - that's OK */ free_pages -= (1 << order) - 1; @@ -2963,10 +2964,21 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, * the high-atomic reserves. This will over-estimate the size of the * atomic reserve but it avoids a search. */ - if (likely(!alloc_harder)) + if (likely(!alloc_harder)) { free_pages -= z->nr_reserved_highatomic; - else - min -= min / 4; + } else { + /* + * OOM victims can try even harder than normal ALLOC_HARDER + * users on the grounds that it's definitely going to be in + * the exit path shortly and free memory. Any allocation it + * makes during the free path will be small and short-lived. + */ + if (alloc_flags & ALLOC_OOM) + min -= min / 2; + else + min -= min / 4; + } + #ifdef CONFIG_CMA /* If allocation can't use CMA areas don't use free CMA pages */ @@ -3204,7 +3216,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) * of allowed nodes. */ if (!(gfp_mask & __GFP_NOMEMALLOC)) - if (test_thread_flag(TIF_MEMDIE) || + if (tsk_is_oom_victim(current) || (current->flags & (PF_MEMALLOC | PF_EXITING))) filter &= ~SHOW_MEM_FILTER_NODES; if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) @@ -3291,10 +3303,13 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, /* * Go through the zonelist yet one more time, keep very high watermark * here, this is only to catch a parallel oom killing, we must fail if - * we're still under heavy pressure. + * we're still under heavy pressure. But make sure that this reclaim + * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY + * allocation which will never fail due to oom_lock already held. */ - page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order, - ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); + page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) & + ~__GFP_DIRECT_RECLAIM, order, + ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); if (page) goto out; @@ -3510,6 +3525,47 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla } #endif /* CONFIG_COMPACTION */ +#ifdef CONFIG_LOCKDEP +struct lockdep_map __fs_reclaim_map = + STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map); + +static bool __need_fs_reclaim(gfp_t gfp_mask) +{ + gfp_mask = current_gfp_context(gfp_mask); + + /* no reclaim without waiting on it */ + if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) + return false; + + /* this guy won't enter reclaim */ + if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) + return false; + + /* We're only interested __GFP_FS allocations for now */ + if (!(gfp_mask & __GFP_FS)) + return false; + + if (gfp_mask & __GFP_NOLOCKDEP) + return false; + + return true; +} + +void fs_reclaim_acquire(gfp_t gfp_mask) +{ + if (__need_fs_reclaim(gfp_mask)) + lock_map_acquire(&__fs_reclaim_map); +} +EXPORT_SYMBOL_GPL(fs_reclaim_acquire); + +void fs_reclaim_release(gfp_t gfp_mask) +{ + if (__need_fs_reclaim(gfp_mask)) + lock_map_release(&__fs_reclaim_map); +} +EXPORT_SYMBOL_GPL(fs_reclaim_release); +#endif + /* Perform direct synchronous page reclaim */ static int __perform_reclaim(gfp_t gfp_mask, unsigned int order, @@ -3524,7 +3580,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); noreclaim_flag = memalloc_noreclaim_save(); - lockdep_set_current_reclaim_state(gfp_mask); + fs_reclaim_acquire(gfp_mask); reclaim_state.reclaimed_slab = 0; current->reclaim_state = &reclaim_state; @@ -3532,7 +3588,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, ac->nodemask); current->reclaim_state = NULL; - lockdep_clear_current_reclaim_state(); + fs_reclaim_release(gfp_mask); memalloc_noreclaim_restore(noreclaim_flag); cond_resched(); @@ -3623,21 +3679,46 @@ gfp_to_alloc_flags(gfp_t gfp_mask) return alloc_flags; } -bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) +static bool oom_reserves_allowed(struct task_struct *tsk) { - if (unlikely(gfp_mask & __GFP_NOMEMALLOC)) + if (!tsk_is_oom_victim(tsk)) + return false; + + /* + * !MMU doesn't have oom reaper so give access to memory reserves + * only to the thread with TIF_MEMDIE set + */ + if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE)) return false; + return true; +} + +/* + * Distinguish requests which really need access to full memory + * reserves from oom victims which can live with a portion of it + */ +static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask) +{ + if (unlikely(gfp_mask & __GFP_NOMEMALLOC)) + return 0; if (gfp_mask & __GFP_MEMALLOC) - return true; + return ALLOC_NO_WATERMARKS; if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) - return true; - if (!in_interrupt() && - ((current->flags & PF_MEMALLOC) || - unlikely(test_thread_flag(TIF_MEMDIE)))) - return true; + return ALLOC_NO_WATERMARKS; + if (!in_interrupt()) { + if (current->flags & PF_MEMALLOC) + return ALLOC_NO_WATERMARKS; + else if (oom_reserves_allowed(current)) + return ALLOC_OOM; + } - return false; + return 0; +} + +bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) +{ + return !!__gfp_pfmemalloc_flags(gfp_mask); } /* @@ -3790,6 +3871,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long alloc_start = jiffies; unsigned int stall_timeout = 10 * HZ; unsigned int cpuset_mems_cookie; + int reserve_flags; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -3895,15 +3977,16 @@ retry: if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, ac); - if (gfp_pfmemalloc_allowed(gfp_mask)) - alloc_flags = ALLOC_NO_WATERMARKS; + reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); + if (reserve_flags) + alloc_flags = reserve_flags; /* * Reset the zonelist iterators if memory policies can be ignored. * These allocations are high priority and system rather than user * orientated. */ - if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) { + if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->high_zoneidx, ac->nodemask); @@ -3980,8 +4063,8 @@ retry: goto got_pg; /* Avoid allocations with no watermarks from looping endlessly */ - if (test_thread_flag(TIF_MEMDIE) && - (alloc_flags == ALLOC_NO_WATERMARKS || + if (tsk_is_oom_victim(current) && + (alloc_flags == ALLOC_OOM || (gfp_mask & __GFP_NOMEMALLOC))) goto nopage; @@ -4061,7 +4144,8 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, *alloc_flags |= ALLOC_CPUSET; } - lockdep_trace_alloc(gfp_mask); + fs_reclaim_acquire(gfp_mask); + fs_reclaim_release(gfp_mask); might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); @@ -4099,10 +4183,11 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, { struct page *page; unsigned int alloc_flags = ALLOC_WMARK_LOW; - gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ + gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { }; gfp_mask &= gfp_allowed_mask; + alloc_mask = gfp_mask; if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) return NULL; @@ -4463,7 +4548,7 @@ long si_mem_available(void) * Estimate the amount of memory available for userspace allocations, * without causing swapping. */ - available = global_page_state(NR_FREE_PAGES) - totalreserve_pages; + available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages; /* * Not all the page cache can be freed, otherwise the system will @@ -4492,7 +4577,7 @@ void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; val->sharedram = global_node_page_state(NR_SHMEM); - val->freeram = global_page_state(NR_FREE_PAGES); + val->freeram = global_zone_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); val->totalhigh = totalhigh_pages; val->freehigh = nr_free_highpages(); @@ -4627,11 +4712,11 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) global_node_page_state(NR_SLAB_UNRECLAIMABLE), global_node_page_state(NR_FILE_MAPPED), global_node_page_state(NR_SHMEM), - global_page_state(NR_PAGETABLE), - global_page_state(NR_BOUNCE), - global_page_state(NR_FREE_PAGES), + global_zone_page_state(NR_PAGETABLE), + global_zone_page_state(NR_BOUNCE), + global_zone_page_state(NR_FREE_PAGES), free_pcp, - global_page_state(NR_FREE_CMA_PAGES)); + global_zone_page_state(NR_FREE_CMA_PAGES)); for_each_online_pgdat(pgdat) { if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) @@ -4793,18 +4878,17 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) * * Add all populated zones of a node to the zonelist. */ -static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, - int nr_zones) +static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs) { struct zone *zone; enum zone_type zone_type = MAX_NR_ZONES; + int nr_zones = 0; do { zone_type--; zone = pgdat->node_zones + zone_type; if (managed_zone(zone)) { - zoneref_set_zone(zone, - &zonelist->_zonerefs[nr_zones++]); + zoneref_set_zone(zone, &zonerefs[nr_zones++]); check_highest_zone(zone_type); } } while (zone_type); @@ -4812,52 +4896,18 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, return nr_zones; } - -/* - * zonelist_order: - * 0 = automatic detection of better ordering. - * 1 = order by ([node] distance, -zonetype) - * 2 = order by (-zonetype, [node] distance) - * - * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create - * the same zonelist. So only NUMA can configure this param. - */ -#define ZONELIST_ORDER_DEFAULT 0 -#define ZONELIST_ORDER_NODE 1 -#define ZONELIST_ORDER_ZONE 2 - -/* zonelist order in the kernel. - * set_zonelist_order() will set this to NODE or ZONE. - */ -static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; -static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; - - #ifdef CONFIG_NUMA -/* The value user specified ....changed by config */ -static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; -/* string for sysctl */ -#define NUMA_ZONELIST_ORDER_LEN 16 -char numa_zonelist_order[16] = "default"; - -/* - * interface for configure zonelist ordering. - * command line option "numa_zonelist_order" - * = "[dD]efault - default, automatic configuration. - * = "[nN]ode - order by node locality, then by zone within node - * = "[zZ]one - order by zone, then by locality within zone - */ static int __parse_numa_zonelist_order(char *s) { - if (*s == 'd' || *s == 'D') { - user_zonelist_order = ZONELIST_ORDER_DEFAULT; - } else if (*s == 'n' || *s == 'N') { - user_zonelist_order = ZONELIST_ORDER_NODE; - } else if (*s == 'z' || *s == 'Z') { - user_zonelist_order = ZONELIST_ORDER_ZONE; - } else { - pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s); + /* + * We used to support different zonlists modes but they turned + * out to be just not useful. Let's keep the warning in place + * if somebody still use the cmd line parameter so that we do + * not fail it silently + */ + if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) { + pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s); return -EINVAL; } return 0; @@ -4865,19 +4915,15 @@ static int __parse_numa_zonelist_order(char *s) static __init int setup_numa_zonelist_order(char *s) { - int ret; - if (!s) return 0; - ret = __parse_numa_zonelist_order(s); - if (ret == 0) - strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); - - return ret; + return __parse_numa_zonelist_order(s); } early_param("numa_zonelist_order", setup_numa_zonelist_order); +char numa_zonelist_order[] = "Node"; + /* * sysctl handler for numa_zonelist_order */ @@ -4885,42 +4931,17 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { - char saved_string[NUMA_ZONELIST_ORDER_LEN]; + char *str; int ret; - static DEFINE_MUTEX(zl_order_mutex); - mutex_lock(&zl_order_mutex); - if (write) { - if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { - ret = -EINVAL; - goto out; - } - strcpy(saved_string, (char *)table->data); - } - ret = proc_dostring(table, write, buffer, length, ppos); - if (ret) - goto out; - if (write) { - int oldval = user_zonelist_order; + if (!write) + return proc_dostring(table, write, buffer, length, ppos); + str = memdup_user_nul(buffer, 16); + if (IS_ERR(str)) + return PTR_ERR(str); - ret = __parse_numa_zonelist_order((char *)table->data); - if (ret) { - /* - * bogus value. restore saved string - */ - strncpy((char *)table->data, saved_string, - NUMA_ZONELIST_ORDER_LEN); - user_zonelist_order = oldval; - } else if (oldval != user_zonelist_order) { - mem_hotplug_begin(); - mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL, NULL); - mutex_unlock(&zonelists_mutex); - mem_hotplug_done(); - } - } -out: - mutex_unlock(&zl_order_mutex); + ret = __parse_numa_zonelist_order(str); + kfree(str); return ret; } @@ -4994,17 +5015,24 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) * This results in maximum locality--normal zone overflows into local * DMA zone, if any--but risks exhausting DMA zone. */ -static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) +static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order, + unsigned nr_nodes) { - int j; - struct zonelist *zonelist; + struct zoneref *zonerefs; + int i; + + zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; + + for (i = 0; i < nr_nodes; i++) { + int nr_zones; + + pg_data_t *node = NODE_DATA(node_order[i]); - zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; - for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) - ; - j = build_zonelists_node(NODE_DATA(node), zonelist, j); - zonelist->_zonerefs[j].zone = NULL; - zonelist->_zonerefs[j].zone_idx = 0; + nr_zones = build_zonerefs_node(node, zonerefs); + zonerefs += nr_zones; + } + zonerefs->zone = NULL; + zonerefs->zone_idx = 0; } /* @@ -5012,13 +5040,14 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) */ static void build_thisnode_zonelists(pg_data_t *pgdat) { - int j; - struct zonelist *zonelist; + struct zoneref *zonerefs; + int nr_zones; - zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK]; - j = build_zonelists_node(pgdat, zonelist, 0); - zonelist->_zonerefs[j].zone = NULL; - zonelist->_zonerefs[j].zone_idx = 0; + zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs; + nr_zones = build_zonerefs_node(pgdat, zonerefs); + zonerefs += nr_zones; + zonerefs->zone = NULL; + zonerefs->zone_idx = 0; } /* @@ -5027,79 +5056,13 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) * exhausted, but results in overflowing to remote node while memory * may still exist in local DMA zone. */ -static int node_order[MAX_NUMNODES]; - -static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) -{ - int pos, j, node; - int zone_type; /* needs to be signed */ - struct zone *z; - struct zonelist *zonelist; - - zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; - pos = 0; - for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { - for (j = 0; j < nr_nodes; j++) { - node = node_order[j]; - z = &NODE_DATA(node)->node_zones[zone_type]; - if (managed_zone(z)) { - zoneref_set_zone(z, - &zonelist->_zonerefs[pos++]); - check_highest_zone(zone_type); - } - } - } - zonelist->_zonerefs[pos].zone = NULL; - zonelist->_zonerefs[pos].zone_idx = 0; -} - -#if defined(CONFIG_64BIT) -/* - * Devices that require DMA32/DMA are relatively rare and do not justify a - * penalty to every machine in case the specialised case applies. Default - * to Node-ordering on 64-bit NUMA machines - */ -static int default_zonelist_order(void) -{ - return ZONELIST_ORDER_NODE; -} -#else -/* - * On 32-bit, the Normal zone needs to be preserved for allocations accessible - * by the kernel. If processes running on node 0 deplete the low memory zone - * then reclaim will occur more frequency increasing stalls and potentially - * be easier to OOM if a large percentage of the zone is under writeback or - * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set. - * Hence, default to zone ordering on 32-bit. - */ -static int default_zonelist_order(void) -{ - return ZONELIST_ORDER_ZONE; -} -#endif /* CONFIG_64BIT */ - -static void set_zonelist_order(void) -{ - if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) - current_zonelist_order = default_zonelist_order(); - else - current_zonelist_order = user_zonelist_order; -} static void build_zonelists(pg_data_t *pgdat) { - int i, node, load; + static int node_order[MAX_NUMNODES]; + int node, load, nr_nodes = 0; nodemask_t used_mask; int local_node, prev_node; - struct zonelist *zonelist; - unsigned int order = current_zonelist_order; - - /* initialize zonelists */ - for (i = 0; i < MAX_ZONELISTS; i++) { - zonelist = pgdat->node_zonelists + i; - zonelist->_zonerefs[0].zone = NULL; - zonelist->_zonerefs[0].zone_idx = 0; - } /* NUMA-aware ordering of nodes */ local_node = pgdat->node_id; @@ -5108,8 +5071,6 @@ static void build_zonelists(pg_data_t *pgdat) nodes_clear(used_mask); memset(node_order, 0, sizeof(node_order)); - i = 0; - while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { /* * We don't want to pressure a particular node. @@ -5120,19 +5081,12 @@ static void build_zonelists(pg_data_t *pgdat) node_distance(local_node, prev_node)) node_load[node] = load; + node_order[nr_nodes++] = node; prev_node = node; load--; - if (order == ZONELIST_ORDER_NODE) - build_zonelists_in_node_order(pgdat, node); - else - node_order[i++] = node; /* remember order */ - } - - if (order == ZONELIST_ORDER_ZONE) { - /* calculate node order -- i.e., DMA last! */ - build_zonelists_in_zone_order(pgdat, i); } + build_zonelists_in_node_order(pgdat, node_order, nr_nodes); build_thisnode_zonelists(pgdat); } @@ -5158,21 +5112,17 @@ static void setup_min_unmapped_ratio(void); static void setup_min_slab_ratio(void); #else /* CONFIG_NUMA */ -static void set_zonelist_order(void) -{ - current_zonelist_order = ZONELIST_ORDER_ZONE; -} - static void build_zonelists(pg_data_t *pgdat) { int node, local_node; - enum zone_type j; - struct zonelist *zonelist; + struct zoneref *zonerefs; + int nr_zones; local_node = pgdat->node_id; - zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; - j = build_zonelists_node(pgdat, zonelist, 0); + zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; + nr_zones = build_zonerefs_node(pgdat, zonerefs); + zonerefs += nr_zones; /* * Now we build the zonelist so that it contains the zones @@ -5185,16 +5135,18 @@ static void build_zonelists(pg_data_t *pgdat) for (node = local_node + 1; node < MAX_NUMNODES; node++) { if (!node_online(node)) continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j); + nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); + zonerefs += nr_zones; } for (node = 0; node < local_node; node++) { if (!node_online(node)) continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j); + nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); + zonerefs += nr_zones; } - zonelist->_zonerefs[j].zone = NULL; - zonelist->_zonerefs[j].zone_idx = 0; + zonerefs->zone = NULL; + zonerefs->zone_idx = 0; } #endif /* CONFIG_NUMA */ @@ -5217,50 +5169,32 @@ static void build_zonelists(pg_data_t *pgdat) static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); -static void setup_zone_pageset(struct zone *zone); -/* - * Global mutex to protect against size modification of zonelists - * as well as to serialize pageset setup for the new populated zone. - */ -DEFINE_MUTEX(zonelists_mutex); - -/* return values int ....just for stop_machine() */ -static int __build_all_zonelists(void *data) +static void __build_all_zonelists(void *data) { int nid; - int cpu; + int __maybe_unused cpu; pg_data_t *self = data; + static DEFINE_SPINLOCK(lock); + + spin_lock(&lock); #ifdef CONFIG_NUMA memset(node_load, 0, sizeof(node_load)); #endif + /* + * This node is hotadded and no memory is yet present. So just + * building zonelists is fine - no need to touch other nodes. + */ if (self && !node_online(self->node_id)) { build_zonelists(self); - } - - for_each_online_node(nid) { - pg_data_t *pgdat = NODE_DATA(nid); - - build_zonelists(pgdat); - } + } else { + for_each_online_node(nid) { + pg_data_t *pgdat = NODE_DATA(nid); - /* - * Initialize the boot_pagesets that are going to be used - * for bootstrapping processors. The real pagesets for - * each zone will be allocated later when the per cpu - * allocator is available. - * - * boot_pagesets are used also for bootstrapping offline - * cpus if the system is already booted because the pagesets - * are needed to initialize allocators on a specific cpu too. - * F.e. the percpu allocator needs the page allocator which - * needs the percpu allocator in order to allocate its pagesets - * (a chicken-egg dilemma). - */ - for_each_possible_cpu(cpu) { - setup_pageset(&per_cpu(boot_pageset, cpu), 0); + build_zonelists(pgdat); + } #ifdef CONFIG_HAVE_MEMORYLESS_NODES /* @@ -5271,45 +5205,53 @@ static int __build_all_zonelists(void *data) * secondary cpus' numa_mem as they come on-line. During * node/memory hotplug, we'll fixup all on-line cpus. */ - if (cpu_online(cpu)) + for_each_online_cpu(cpu) set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); #endif } - return 0; + spin_unlock(&lock); } static noinline void __init build_all_zonelists_init(void) { + int cpu; + __build_all_zonelists(NULL); + + /* + * Initialize the boot_pagesets that are going to be used + * for bootstrapping processors. The real pagesets for + * each zone will be allocated later when the per cpu + * allocator is available. + * + * boot_pagesets are used also for bootstrapping offline + * cpus if the system is already booted because the pagesets + * are needed to initialize allocators on a specific cpu too. + * F.e. the percpu allocator needs the page allocator which + * needs the percpu allocator in order to allocate its pagesets + * (a chicken-egg dilemma). + */ + for_each_possible_cpu(cpu) + setup_pageset(&per_cpu(boot_pageset, cpu), 0); + mminit_verify_zonelist(); cpuset_init_current_mems_allowed(); } /* - * Called with zonelists_mutex held always * unless system_state == SYSTEM_BOOTING. * - * __ref due to (1) call of __meminit annotated setup_zone_pageset - * [we're only called with non-NULL zone through __meminit paths] and - * (2) call of __init annotated helper build_all_zonelists_init + * __ref due to call of __init annotated helper build_all_zonelists_init * [protected by SYSTEM_BOOTING]. */ -void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) +void __ref build_all_zonelists(pg_data_t *pgdat) { - set_zonelist_order(); - if (system_state == SYSTEM_BOOTING) { build_all_zonelists_init(); } else { -#ifdef CONFIG_MEMORY_HOTPLUG - if (zone) - setup_zone_pageset(zone); -#endif - /* we have to stop all cpus to guarantee there is no user - of zonelist */ - stop_machine_cpuslocked(__build_all_zonelists, pgdat, NULL); + __build_all_zonelists(pgdat); /* cpuset refresh routine should be here */ } vm_total_pages = nr_free_pagecache_pages(); @@ -5325,9 +5267,8 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) else page_group_by_mobility_disabled = 0; - pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n", + pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n", nr_online_nodes, - zonelist_order_name[current_zonelist_order], page_group_by_mobility_disabled ? "off" : "on", vm_total_pages); #ifdef CONFIG_NUMA @@ -5581,7 +5522,7 @@ static void __meminit zone_pageset_init(struct zone *zone, int cpu) pageset_set_high_and_batch(zone, pcp); } -static void __meminit setup_zone_pageset(struct zone *zone) +void __meminit setup_zone_pageset(struct zone *zone) { int cpu; zone->pageset = alloc_percpu(struct per_cpu_pageset); @@ -7035,9 +6976,11 @@ static void __setup_per_zone_wmarks(void) */ void setup_per_zone_wmarks(void) { - mutex_lock(&zonelists_mutex); + static DEFINE_SPINLOCK(lock); + + spin_lock(&lock); __setup_per_zone_wmarks(); - mutex_unlock(&zonelists_mutex); + spin_unlock(&lock); } /* diff --git a/mm/page_ext.c b/mm/page_ext.c index 88ccc044b09a..32f18911deda 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -222,10 +222,7 @@ static void *__meminit alloc_page_ext(size_t size, int nid) return addr; } - if (node_state(nid, N_HIGH_MEMORY)) - addr = vzalloc_node(size, nid); - else - addr = vzalloc(size); + addr = vzalloc_node(size, nid); return addr; } @@ -409,6 +406,7 @@ void __init page_ext_init(void) continue; if (init_section_page_ext(pfn, nid)) goto oom; + cond_resched(); } } hotplug_memory_notifier(page_ext_callback, 0); diff --git a/mm/page_idle.c b/mm/page_idle.c index 1b0f48c62316..4bd03a8d809e 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -204,7 +204,7 @@ static struct bin_attribute *page_idle_bin_attrs[] = { NULL, }; -static struct attribute_group page_idle_attr_group = { +static const struct attribute_group page_idle_attr_group = { .bin_attrs = page_idle_bin_attrs, .name = "page_idle", }; diff --git a/mm/page_io.c b/mm/page_io.c index 5f61b54ee1f3..21502d341a67 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -28,16 +28,21 @@ static struct bio *get_swap_bio(gfp_t gfp_flags, struct page *page, bio_end_io_t end_io) { + int i, nr = hpage_nr_pages(page); struct bio *bio; - bio = bio_alloc(gfp_flags, 1); + bio = bio_alloc(gfp_flags, nr); if (bio) { - bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev); + struct block_device *bdev; + + bio->bi_iter.bi_sector = map_swap_page(page, &bdev); + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9; bio->bi_end_io = end_io; - bio_add_page(bio, page, PAGE_SIZE, 0); - BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE); + for (i = 0; i < nr; i++) + bio_add_page(bio, page + i, PAGE_SIZE, 0); + VM_BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE * nr); } return bio; } @@ -58,8 +63,7 @@ void end_swap_bio_write(struct bio *bio) */ set_page_dirty(page); pr_alert("Write-error on swap-device (%u:%u:%llu)\n", - imajor(bio->bi_bdev->bd_inode), - iminor(bio->bi_bdev->bd_inode), + MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); ClearPageReclaim(page); } @@ -124,8 +128,7 @@ static void end_swap_bio_read(struct bio *bio) SetPageError(page); ClearPageUptodate(page); pr_alert("Read-error on swap-device (%u:%u:%llu)\n", - imajor(bio->bi_bdev->bd_inode), - iminor(bio->bi_bdev->bd_inode), + MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); goto out; } @@ -262,6 +265,15 @@ static sector_t swap_page_sector(struct page *page) return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9); } +static inline void count_swpout_vm_event(struct page *page) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (unlikely(PageTransHuge(page))) + count_vm_event(THP_SWPOUT); +#endif + count_vm_events(PSWPOUT, hpage_nr_pages(page)); +} + int __swap_writepage(struct page *page, struct writeback_control *wbc, bio_end_io_t end_write_func) { @@ -313,7 +325,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc); if (!ret) { - count_vm_event(PSWPOUT); + count_swpout_vm_event(page); return 0; } @@ -326,7 +338,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, goto out; } bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); - count_vm_event(PSWPOUT); + count_swpout_vm_event(page); set_page_writeback(page); unlock_page(page); submit_bio(bio); @@ -340,7 +352,7 @@ int swap_readpage(struct page *page, bool do_poll) int ret = 0; struct swap_info_struct *sis = page_swap_info(page); blk_qc_t qc; - struct block_device *bdev; + struct gendisk *disk; VM_BUG_ON_PAGE(!PageSwapCache(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -379,7 +391,7 @@ int swap_readpage(struct page *page, bool do_poll) ret = -ENOMEM; goto out; } - bdev = bio->bi_bdev; + disk = bio->bi_disk; /* * Keep this task valid during swap readpage because the oom killer may * attempt to access it in the page fault retry time check. @@ -395,7 +407,7 @@ int swap_readpage(struct page *page, bool do_poll) if (!READ_ONCE(bio->bi_private)) break; - if (!blk_mq_poll(bdev_get_queue(bdev), qc)) + if (!blk_mq_poll(disk->queue, qc)) break; } __set_current_state(TASK_RUNNING); diff --git a/mm/page_owner.c b/mm/page_owner.c index 0fd9dcf2c5dc..57abca62d4db 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -30,6 +30,7 @@ DEFINE_STATIC_KEY_FALSE(page_owner_inited); static depot_stack_handle_t dummy_handle; static depot_stack_handle_t failure_handle; +static depot_stack_handle_t early_handle; static void init_early_allocated_pages(void); @@ -53,7 +54,7 @@ static bool need_page_owner(void) return true; } -static noinline void register_dummy_stack(void) +static __always_inline depot_stack_handle_t create_dummy_stack(void) { unsigned long entries[4]; struct stack_trace dummy; @@ -64,21 +65,22 @@ static noinline void register_dummy_stack(void) dummy.skip = 0; save_stack_trace(&dummy); - dummy_handle = depot_save_stack(&dummy, GFP_KERNEL); + return depot_save_stack(&dummy, GFP_KERNEL); } -static noinline void register_failure_stack(void) +static noinline void register_dummy_stack(void) { - unsigned long entries[4]; - struct stack_trace failure; + dummy_handle = create_dummy_stack(); +} - failure.nr_entries = 0; - failure.max_entries = ARRAY_SIZE(entries); - failure.entries = &entries[0]; - failure.skip = 0; +static noinline void register_failure_stack(void) +{ + failure_handle = create_dummy_stack(); +} - save_stack_trace(&failure); - failure_handle = depot_save_stack(&failure, GFP_KERNEL); +static noinline void register_early_stack(void) +{ + early_handle = create_dummy_stack(); } static void init_page_owner(void) @@ -88,6 +90,7 @@ static void init_page_owner(void) register_dummy_stack(); register_failure_stack(); + register_early_stack(); static_branch_enable(&page_owner_inited); init_early_allocated_pages(); } @@ -139,7 +142,7 @@ static noinline depot_stack_handle_t save_stack(gfp_t flags) .nr_entries = 0, .entries = entries, .max_entries = PAGE_OWNER_STACK_DEPTH, - .skip = 0 + .skip = 2 }; depot_stack_handle_t handle; @@ -165,17 +168,13 @@ static noinline depot_stack_handle_t save_stack(gfp_t flags) return handle; } -noinline void __set_page_owner(struct page *page, unsigned int order, - gfp_t gfp_mask) +static inline void __set_page_owner_handle(struct page_ext *page_ext, + depot_stack_handle_t handle, unsigned int order, gfp_t gfp_mask) { - struct page_ext *page_ext = lookup_page_ext(page); struct page_owner *page_owner; - if (unlikely(!page_ext)) - return; - page_owner = get_page_owner(page_ext); - page_owner->handle = save_stack(gfp_mask); + page_owner->handle = handle; page_owner->order = order; page_owner->gfp_mask = gfp_mask; page_owner->last_migrate_reason = -1; @@ -183,6 +182,19 @@ noinline void __set_page_owner(struct page *page, unsigned int order, __set_bit(PAGE_EXT_OWNER, &page_ext->flags); } +noinline void __set_page_owner(struct page *page, unsigned int order, + gfp_t gfp_mask) +{ + struct page_ext *page_ext = lookup_page_ext(page); + depot_stack_handle_t handle; + + if (unlikely(!page_ext)) + return; + + handle = save_stack(gfp_mask); + __set_page_owner_handle(page_ext, handle, order, gfp_mask); +} + void __set_page_owner_migrate_reason(struct page *page, int reason) { struct page_ext *page_ext = lookup_page_ext(page); @@ -550,11 +562,17 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) continue; /* - * We are safe to check buddy flag and order, because - * this is init stage and only single thread runs. + * To avoid having to grab zone->lock, be a little + * careful when reading buddy page order. The only + * danger is that we skip too much and potentially miss + * some early allocated pages, which is better than + * heavy lock contention. */ if (PageBuddy(page)) { - pfn += (1UL << page_order(page)) - 1; + unsigned long order = page_order_unsafe(page); + + if (order > 0 && order < MAX_ORDER) + pfn += (1UL << order) - 1; continue; } @@ -565,14 +583,15 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) if (unlikely(!page_ext)) continue; - /* Maybe overraping zone */ + /* Maybe overlapping zone */ if (test_bit(PAGE_EXT_OWNER, &page_ext->flags)) continue; /* Found early allocated page */ - set_page_owner(page, 0, 0); + __set_page_owner_handle(page_ext, early_handle, 0, 0); count++; } + cond_resched(); } pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n", @@ -583,15 +602,12 @@ static void init_zones_in_node(pg_data_t *pgdat) { struct zone *zone; struct zone *node_zones = pgdat->node_zones; - unsigned long flags; for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { if (!populated_zone(zone)) continue; - spin_lock_irqsave(&zone->lock, flags); init_pages_in_zone(pgdat, zone); - spin_unlock_irqrestore(&zone->lock, flags); } } diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 8ec6ba230bb9..6a03946469a9 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -48,6 +48,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw) if (!is_swap_pte(*pvmw->pte)) return false; entry = pte_to_swp_entry(*pvmw->pte); + if (!is_migration_entry(entry)) return false; if (migration_entry_to_page(entry) - pvmw->page >= @@ -60,6 +61,15 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw) WARN_ON_ONCE(1); #endif } else { + if (is_swap_pte(*pvmw->pte)) { + swp_entry_t entry; + + entry = pte_to_swp_entry(*pvmw->pte); + if (is_device_private_entry(entry) && + device_private_entry_to_page(entry) == pvmw->page) + return true; + } + if (!pte_present(*pvmw->pte)) return false; @@ -138,16 +148,28 @@ restart: if (!pud_present(*pud)) return false; pvmw->pmd = pmd_offset(pud, pvmw->address); - if (pmd_trans_huge(*pvmw->pmd)) { + if (pmd_trans_huge(*pvmw->pmd) || is_pmd_migration_entry(*pvmw->pmd)) { pvmw->ptl = pmd_lock(mm, pvmw->pmd); - if (!pmd_present(*pvmw->pmd)) - return not_found(pvmw); if (likely(pmd_trans_huge(*pvmw->pmd))) { if (pvmw->flags & PVMW_MIGRATION) return not_found(pvmw); if (pmd_page(*pvmw->pmd) != page) return not_found(pvmw); return true; + } else if (!pmd_present(*pvmw->pmd)) { + if (thp_migration_supported()) { + if (!(pvmw->flags & PVMW_MIGRATION)) + return not_found(pvmw); + if (is_migration_entry(pmd_to_swp_entry(*pvmw->pmd))) { + swp_entry_t entry = pmd_to_swp_entry(*pvmw->pmd); + + if (migration_entry_to_page(entry) != page) + return not_found(pvmw); + return true; + } + } else + WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); + return not_found(pvmw); } else { /* THP pmd was split under us: handle on pte level */ spin_unlock(pvmw->ptl); diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index cd2442e13d8f..7065faf74b46 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -4,6 +4,22 @@ #include <linux/types.h> #include <linux/percpu.h> +/* + * pcpu_block_md is the metadata block struct. + * Each chunk's bitmap is split into a number of full blocks. + * All units are in terms of bits. + */ +struct pcpu_block_md { + int contig_hint; /* contig hint for block */ + int contig_hint_start; /* block relative starting + position of the contig hint */ + int left_free; /* size of free space along + the left side of the block */ + int right_free; /* size of free space along + the right side of the block */ + int first_free; /* block position of first free */ +}; + struct pcpu_chunk { #ifdef CONFIG_PERCPU_STATS int nr_alloc; /* # of allocations */ @@ -11,24 +27,29 @@ struct pcpu_chunk { #endif struct list_head list; /* linked to pcpu_slot lists */ - int free_size; /* free bytes in the chunk */ - int contig_hint; /* max contiguous size hint */ + int free_bytes; /* free bytes in the chunk */ + int contig_bits; /* max contiguous size hint */ + int contig_bits_start; /* contig_bits starting + offset */ void *base_addr; /* base address of this chunk */ - int map_used; /* # of map entries used before the sentry */ - int map_alloc; /* # of map entries allocated */ - int *map; /* allocation map */ - struct list_head map_extend_list;/* on pcpu_map_extend_chunks */ + unsigned long *alloc_map; /* allocation map */ + unsigned long *bound_map; /* boundary map */ + struct pcpu_block_md *md_blocks; /* metadata blocks */ void *data; /* chunk data */ - int first_free; /* no free below this */ + int first_bit; /* no free below this */ bool immutable; /* no [de]population allowed */ - bool has_reserved; /* Indicates if chunk has reserved space - at the beginning. Reserved chunk will - contain reservation for static chunk. - Dynamic chunk will contain reservation - for static and reserved chunks. */ + int start_offset; /* the overlap with the previous + region to have a page aligned + base_addr */ + int end_offset; /* additional area required to + have the region end page + aligned */ + + int nr_pages; /* # of pages served by this chunk */ int nr_populated; /* # of populated pages */ + int nr_empty_pop_pages; /* # of empty populated pages */ unsigned long populated[]; /* populated bitmap */ }; @@ -36,10 +57,47 @@ extern spinlock_t pcpu_lock; extern struct list_head *pcpu_slot; extern int pcpu_nr_slots; +extern int pcpu_nr_empty_pop_pages; extern struct pcpu_chunk *pcpu_first_chunk; extern struct pcpu_chunk *pcpu_reserved_chunk; +/** + * pcpu_chunk_nr_blocks - converts nr_pages to # of md_blocks + * @chunk: chunk of interest + * + * This conversion is from the number of physical pages that the chunk + * serves to the number of bitmap blocks used. + */ +static inline int pcpu_chunk_nr_blocks(struct pcpu_chunk *chunk) +{ + return chunk->nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE; +} + +/** + * pcpu_nr_pages_to_map_bits - converts the pages to size of bitmap + * @pages: number of physical pages + * + * This conversion is from physical pages to the number of bits + * required in the bitmap. + */ +static inline int pcpu_nr_pages_to_map_bits(int pages) +{ + return pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; +} + +/** + * pcpu_chunk_map_bits - helper to convert nr_pages to size of bitmap + * @chunk: chunk of interest + * + * This conversion is from the number of physical pages that the chunk + * serves to the number of bits in the bitmap. + */ +static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk) +{ + return pcpu_nr_pages_to_map_bits(chunk->nr_pages); +} + #ifdef CONFIG_PERCPU_STATS #include <linux/spinlock.h> diff --git a/mm/percpu-km.c b/mm/percpu-km.c index eb58aa4c0997..d2a76642c4ae 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -69,7 +69,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void) chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; spin_lock_irq(&pcpu_lock); - pcpu_chunk_populated(chunk, 0, nr_pages); + pcpu_chunk_populated(chunk, 0, nr_pages, false); spin_unlock_irq(&pcpu_lock); pcpu_stats_chunk_alloc(); diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c index 03524a56eeff..6142484e88f7 100644 --- a/mm/percpu-stats.c +++ b/mm/percpu-stats.c @@ -18,7 +18,7 @@ #include "percpu-internal.h" #define P(X, Y) \ - seq_printf(m, " %-24s: %8lld\n", X, (long long int)Y) + seq_printf(m, " %-20s: %12lld\n", X, (long long int)Y) struct percpu_stats pcpu_stats; struct pcpu_alloc_info pcpu_stats_ai; @@ -29,64 +29,85 @@ static int cmpint(const void *a, const void *b) } /* - * Iterates over all chunks to find the max # of map entries used. + * Iterates over all chunks to find the max nr_alloc entries. */ -static int find_max_map_used(void) +static int find_max_nr_alloc(void) { struct pcpu_chunk *chunk; - int slot, max_map_used; + int slot, max_nr_alloc; - max_map_used = 0; + max_nr_alloc = 0; for (slot = 0; slot < pcpu_nr_slots; slot++) list_for_each_entry(chunk, &pcpu_slot[slot], list) - max_map_used = max(max_map_used, chunk->map_used); + max_nr_alloc = max(max_nr_alloc, chunk->nr_alloc); - return max_map_used; + return max_nr_alloc; } /* * Prints out chunk state. Fragmentation is considered between * the beginning of the chunk to the last allocation. + * + * All statistics are in bytes unless stated otherwise. */ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, - void *buffer) + int *buffer) { - int i, s_index, last_alloc, alloc_sign, as_len; + int i, last_alloc, as_len, start, end; int *alloc_sizes, *p; /* statistics */ int sum_frag = 0, max_frag = 0; int cur_min_alloc = 0, cur_med_alloc = 0, cur_max_alloc = 0; alloc_sizes = buffer; - s_index = chunk->has_reserved ? 1 : 0; - - /* find last allocation */ - last_alloc = -1; - for (i = chunk->map_used - 1; i >= s_index; i--) { - if (chunk->map[i] & 1) { - last_alloc = i; - break; - } - } - /* if the chunk is not empty - ignoring reserve */ - if (last_alloc >= s_index) { - as_len = last_alloc + 1 - s_index; - - /* - * Iterate through chunk map computing size info. - * The first bit is overloaded to be a used flag. - * negative = free space, positive = allocated - */ - for (i = 0, p = chunk->map + s_index; i < as_len; i++, p++) { - alloc_sign = (*p & 1) ? 1 : -1; - alloc_sizes[i] = alloc_sign * - ((p[1] & ~1) - (p[0] & ~1)); + /* + * find_last_bit returns the start value if nothing found. + * Therefore, we must determine if it is a failure of find_last_bit + * and set the appropriate value. + */ + last_alloc = find_last_bit(chunk->alloc_map, + pcpu_chunk_map_bits(chunk) - + chunk->end_offset / PCPU_MIN_ALLOC_SIZE - 1); + last_alloc = test_bit(last_alloc, chunk->alloc_map) ? + last_alloc + 1 : 0; + + as_len = 0; + start = chunk->start_offset; + + /* + * If a bit is set in the allocation map, the bound_map identifies + * where the allocation ends. If the allocation is not set, the + * bound_map does not identify free areas as it is only kept accurate + * on allocation, not free. + * + * Positive values are allocations and negative values are free + * fragments. + */ + while (start < last_alloc) { + if (test_bit(start, chunk->alloc_map)) { + end = find_next_bit(chunk->bound_map, last_alloc, + start + 1); + alloc_sizes[as_len] = 1; + } else { + end = find_next_bit(chunk->alloc_map, last_alloc, + start + 1); + alloc_sizes[as_len] = -1; } - sort(alloc_sizes, as_len, sizeof(chunk->map[0]), cmpint, NULL); + alloc_sizes[as_len++] *= (end - start) * PCPU_MIN_ALLOC_SIZE; + + start = end; + } + + /* + * The negative values are free fragments and thus sorting gives the + * free fragments at the beginning in largest first order. + */ + if (as_len > 0) { + sort(alloc_sizes, as_len, sizeof(int), cmpint, NULL); - /* Iterate through the unallocated fragements. */ + /* iterate through the unallocated fragments */ for (i = 0, p = alloc_sizes; *p < 0 && i < as_len; i++, p++) { sum_frag -= *p; max_frag = max(max_frag, -1 * (*p)); @@ -99,8 +120,10 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, P("nr_alloc", chunk->nr_alloc); P("max_alloc_size", chunk->max_alloc_size); - P("free_size", chunk->free_size); - P("contig_hint", chunk->contig_hint); + P("empty_pop_pages", chunk->nr_empty_pop_pages); + P("first_bit", chunk->first_bit); + P("free_bytes", chunk->free_bytes); + P("contig_bytes", chunk->contig_bits * PCPU_MIN_ALLOC_SIZE); P("sum_frag", sum_frag); P("max_frag", max_frag); P("cur_min_alloc", cur_min_alloc); @@ -112,29 +135,30 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, static int percpu_stats_show(struct seq_file *m, void *v) { struct pcpu_chunk *chunk; - int slot, max_map_used; - void *buffer; + int slot, max_nr_alloc; + int *buffer; alloc_buffer: spin_lock_irq(&pcpu_lock); - max_map_used = find_max_map_used(); + max_nr_alloc = find_max_nr_alloc(); spin_unlock_irq(&pcpu_lock); - buffer = vmalloc(max_map_used * sizeof(pcpu_first_chunk->map[0])); + /* there can be at most this many free and allocated fragments */ + buffer = vmalloc((2 * max_nr_alloc + 1) * sizeof(int)); if (!buffer) return -ENOMEM; spin_lock_irq(&pcpu_lock); /* if the buffer allocated earlier is too small */ - if (max_map_used < find_max_map_used()) { + if (max_nr_alloc < find_max_nr_alloc()) { spin_unlock_irq(&pcpu_lock); vfree(buffer); goto alloc_buffer; } #define PL(X) \ - seq_printf(m, " %-24s: %8lld\n", #X, (long long int)pcpu_stats_ai.X) + seq_printf(m, " %-20s: %12lld\n", #X, (long long int)pcpu_stats_ai.X) seq_printf(m, "Percpu Memory Statistics\n" @@ -151,7 +175,7 @@ alloc_buffer: #undef PL #define PU(X) \ - seq_printf(m, " %-18s: %14llu\n", #X, (unsigned long long)pcpu_stats.X) + seq_printf(m, " %-20s: %12llu\n", #X, (unsigned long long)pcpu_stats.X) seq_printf(m, "Global Stats:\n" @@ -164,6 +188,7 @@ alloc_buffer: PU(nr_max_chunks); PU(min_alloc_size); PU(max_alloc_size); + P("empty_pop_pages", pcpu_nr_empty_pop_pages); seq_putc(m, '\n'); #undef PU diff --git a/mm/percpu.c b/mm/percpu.c index bd4130a69bbc..59d44d61f5f1 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -4,44 +4,53 @@ * Copyright (C) 2009 SUSE Linux Products GmbH * Copyright (C) 2009 Tejun Heo <tj@kernel.org> * - * This file is released under the GPLv2. + * Copyright (C) 2017 Facebook Inc. + * Copyright (C) 2017 Dennis Zhou <dennisszhou@gmail.com> * - * This is percpu allocator which can handle both static and dynamic - * areas. Percpu areas are allocated in chunks. Each chunk is - * consisted of boot-time determined number of units and the first - * chunk is used for static percpu variables in the kernel image - * (special boot time alloc/init handling necessary as these areas - * need to be brought up before allocation services are running). - * Unit grows as necessary and all units grow or shrink in unison. - * When a chunk is filled up, another chunk is allocated. + * This file is released under the GPLv2 license. + * + * The percpu allocator handles both static and dynamic areas. Percpu + * areas are allocated in chunks which are divided into units. There is + * a 1-to-1 mapping for units to possible cpus. These units are grouped + * based on NUMA properties of the machine. * * c0 c1 c2 * ------------------- ------------------- ------------ * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u * ------------------- ...... ------------------- .... ------------ * - * Allocation is done in offset-size areas of single unit space. Ie, - * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, - * c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to - * cpus. On NUMA, the mapping can be non-linear and even sparse. - * Percpu access can be done by configuring percpu base registers - * according to cpu to unit mapping and pcpu_unit_size. - * - * There are usually many small percpu allocations many of them being - * as small as 4 bytes. The allocator organizes chunks into lists - * according to free size and tries to allocate from the fullest one. - * Each chunk keeps the maximum contiguous area size hint which is - * guaranteed to be equal to or larger than the maximum contiguous - * area in the chunk. This helps the allocator not to iterate the - * chunk maps unnecessarily. - * - * Allocation state in each chunk is kept using an array of integers - * on chunk->map. A positive value in the map represents a free - * region and negative allocated. Allocation inside a chunk is done - * by scanning this map sequentially and serving the first matching - * entry. This is mostly copied from the percpu_modalloc() allocator. - * Chunks can be determined from the address using the index field - * in the page struct. The index field contains a pointer to the chunk. + * Allocation is done by offsets into a unit's address space. Ie., an + * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0, + * c1:u1, c1:u2, etc. On NUMA machines, the mapping may be non-linear + * and even sparse. Access is handled by configuring percpu base + * registers according to the cpu to unit mappings and offsetting the + * base address using pcpu_unit_size. + * + * There is special consideration for the first chunk which must handle + * the static percpu variables in the kernel image as allocation services + * are not online yet. In short, the first chunk is structured like so: + * + * <Static | [Reserved] | Dynamic> + * + * The static data is copied from the original section managed by the + * linker. The reserved section, if non-zero, primarily manages static + * percpu variables from kernel modules. Finally, the dynamic section + * takes care of normal allocations. + * + * The allocator organizes chunks into lists according to free size and + * tries to allocate from the fullest chunk first. Each chunk is managed + * by a bitmap with metadata blocks. The allocation map is updated on + * every allocation and free to reflect the current state while the boundary + * map is only updated on allocation. Each metadata block contains + * information to help mitigate the need to iterate over large portions + * of the bitmap. The reverse mapping from page to chunk is stored in + * the page's index. Lastly, units are lazily backed and grow in unison. + * + * There is a unique conversion that goes on here between bytes and bits. + * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE. The chunk + * tracks the number of pages it is responsible for in nr_pages. Helper + * functions are used to convert from between the bytes, bits, and blocks. + * All hints are managed in bits unless explicitly stated. * * To use this allocator, arch code should do the following: * @@ -58,6 +67,7 @@ #include <linux/bitmap.h> #include <linux/bootmem.h> #include <linux/err.h> +#include <linux/lcm.h> #include <linux/list.h> #include <linux/log2.h> #include <linux/mm.h> @@ -81,10 +91,9 @@ #include "percpu-internal.h" -#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ -#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ -#define PCPU_ATOMIC_MAP_MARGIN_LOW 32 -#define PCPU_ATOMIC_MAP_MARGIN_HIGH 64 +/* the slots are sorted by free bytes left, 1-31 bytes share the same slot */ +#define PCPU_SLOT_BASE_SHIFT 5 + #define PCPU_EMPTY_POP_PAGES_LOW 2 #define PCPU_EMPTY_POP_PAGES_HIGH 4 @@ -140,13 +149,10 @@ struct pcpu_chunk *pcpu_first_chunk __ro_after_init; /* * Optional reserved chunk. This chunk reserves part of the first - * chunk and serves it for reserved allocations. The amount of - * reserved offset is in pcpu_reserved_chunk_limit. When reserved - * area doesn't exist, the following variables contain NULL and 0 - * respectively. + * chunk and serves it for reserved allocations. When the reserved + * region doesn't exist, the following variable is NULL. */ struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init; -static int pcpu_reserved_chunk_limit __ro_after_init; DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */ @@ -160,7 +166,7 @@ static LIST_HEAD(pcpu_map_extend_chunks); * The number of empty populated pages, protected by pcpu_lock. The * reserved chunk doesn't contribute to the count. */ -static int pcpu_nr_empty_pop_pages; +int pcpu_nr_empty_pop_pages; /* * Balance work is used to populate or destroy chunks asynchronously. We @@ -179,19 +185,26 @@ static void pcpu_schedule_balance_work(void) schedule_work(&pcpu_balance_work); } -static bool pcpu_addr_in_first_chunk(void *addr) +/** + * pcpu_addr_in_chunk - check if the address is served from this chunk + * @chunk: chunk of interest + * @addr: percpu address + * + * RETURNS: + * True if the address is served from this chunk. + */ +static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr) { - void *first_start = pcpu_first_chunk->base_addr; + void *start_addr, *end_addr; - return addr >= first_start && addr < first_start + pcpu_unit_size; -} + if (!chunk) + return false; -static bool pcpu_addr_in_reserved_chunk(void *addr) -{ - void *first_start = pcpu_first_chunk->base_addr; + start_addr = chunk->base_addr + chunk->start_offset; + end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE - + chunk->end_offset; - return addr >= first_start && - addr < first_start + pcpu_reserved_chunk_limit; + return addr >= start_addr && addr < end_addr; } static int __pcpu_size_to_slot(int size) @@ -209,10 +222,10 @@ static int pcpu_size_to_slot(int size) static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) { - if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int)) + if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk->contig_bits == 0) return 0; - return pcpu_size_to_slot(chunk->free_size); + return pcpu_size_to_slot(chunk->free_bytes); } /* set the pointer to a chunk in a page struct */ @@ -232,42 +245,200 @@ static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx) return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx; } +static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx) +{ + return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT); +} + static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) { - return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] + - (page_idx << PAGE_SHIFT); + return (unsigned long)chunk->base_addr + + pcpu_unit_page_offset(cpu, page_idx); } -static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk, - int *rs, int *re, int end) +static void pcpu_next_unpop(unsigned long *bitmap, int *rs, int *re, int end) { - *rs = find_next_zero_bit(chunk->populated, end, *rs); - *re = find_next_bit(chunk->populated, end, *rs + 1); + *rs = find_next_zero_bit(bitmap, end, *rs); + *re = find_next_bit(bitmap, end, *rs + 1); } -static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk, - int *rs, int *re, int end) +static void pcpu_next_pop(unsigned long *bitmap, int *rs, int *re, int end) { - *rs = find_next_bit(chunk->populated, end, *rs); - *re = find_next_zero_bit(chunk->populated, end, *rs + 1); + *rs = find_next_bit(bitmap, end, *rs); + *re = find_next_zero_bit(bitmap, end, *rs + 1); } /* - * (Un)populated page region iterators. Iterate over (un)populated - * page regions between @start and @end in @chunk. @rs and @re should - * be integer variables and will be set to start and end page index of - * the current region. + * Bitmap region iterators. Iterates over the bitmap between + * [@start, @end) in @chunk. @rs and @re should be integer variables + * and will be set to start and end index of the current free region. + */ +#define pcpu_for_each_unpop_region(bitmap, rs, re, start, end) \ + for ((rs) = (start), pcpu_next_unpop((bitmap), &(rs), &(re), (end)); \ + (rs) < (re); \ + (rs) = (re) + 1, pcpu_next_unpop((bitmap), &(rs), &(re), (end))) + +#define pcpu_for_each_pop_region(bitmap, rs, re, start, end) \ + for ((rs) = (start), pcpu_next_pop((bitmap), &(rs), &(re), (end)); \ + (rs) < (re); \ + (rs) = (re) + 1, pcpu_next_pop((bitmap), &(rs), &(re), (end))) + +/* + * The following are helper functions to help access bitmaps and convert + * between bitmap offsets to address offsets. + */ +static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index) +{ + return chunk->alloc_map + + (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG); +} + +static unsigned long pcpu_off_to_block_index(int off) +{ + return off / PCPU_BITMAP_BLOCK_BITS; +} + +static unsigned long pcpu_off_to_block_off(int off) +{ + return off & (PCPU_BITMAP_BLOCK_BITS - 1); +} + +static unsigned long pcpu_block_off_to_off(int index, int off) +{ + return index * PCPU_BITMAP_BLOCK_BITS + off; +} + +/** + * pcpu_next_md_free_region - finds the next hint free area + * @chunk: chunk of interest + * @bit_off: chunk offset + * @bits: size of free area + * + * Helper function for pcpu_for_each_md_free_region. It checks + * block->contig_hint and performs aggregation across blocks to find the + * next hint. It modifies bit_off and bits in-place to be consumed in the + * loop. + */ +static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off, + int *bits) +{ + int i = pcpu_off_to_block_index(*bit_off); + int block_off = pcpu_off_to_block_off(*bit_off); + struct pcpu_block_md *block; + + *bits = 0; + for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk); + block++, i++) { + /* handles contig area across blocks */ + if (*bits) { + *bits += block->left_free; + if (block->left_free == PCPU_BITMAP_BLOCK_BITS) + continue; + return; + } + + /* + * This checks three things. First is there a contig_hint to + * check. Second, have we checked this hint before by + * comparing the block_off. Third, is this the same as the + * right contig hint. In the last case, it spills over into + * the next block and should be handled by the contig area + * across blocks code. + */ + *bits = block->contig_hint; + if (*bits && block->contig_hint_start >= block_off && + *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) { + *bit_off = pcpu_block_off_to_off(i, + block->contig_hint_start); + return; + } + + *bits = block->right_free; + *bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free; + } +} + +/** + * pcpu_next_fit_region - finds fit areas for a given allocation request + * @chunk: chunk of interest + * @alloc_bits: size of allocation + * @align: alignment of area (max PAGE_SIZE) + * @bit_off: chunk offset + * @bits: size of free area + * + * Finds the next free region that is viable for use with a given size and + * alignment. This only returns if there is a valid area to be used for this + * allocation. block->first_free is returned if the allocation request fits + * within the block to see if the request can be fulfilled prior to the contig + * hint. */ -#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \ - for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \ - (rs) < (re); \ - (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end))) +static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits, + int align, int *bit_off, int *bits) +{ + int i = pcpu_off_to_block_index(*bit_off); + int block_off = pcpu_off_to_block_off(*bit_off); + struct pcpu_block_md *block; + + *bits = 0; + for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk); + block++, i++) { + /* handles contig area across blocks */ + if (*bits) { + *bits += block->left_free; + if (*bits >= alloc_bits) + return; + if (block->left_free == PCPU_BITMAP_BLOCK_BITS) + continue; + } + + /* check block->contig_hint */ + *bits = ALIGN(block->contig_hint_start, align) - + block->contig_hint_start; + /* + * This uses the block offset to determine if this has been + * checked in the prior iteration. + */ + if (block->contig_hint && + block->contig_hint_start >= block_off && + block->contig_hint >= *bits + alloc_bits) { + *bits += alloc_bits + block->contig_hint_start - + block->first_free; + *bit_off = pcpu_block_off_to_off(i, block->first_free); + return; + } + + *bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free, + align); + *bits = PCPU_BITMAP_BLOCK_BITS - *bit_off; + *bit_off = pcpu_block_off_to_off(i, *bit_off); + if (*bits >= alloc_bits) + return; + } -#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \ - for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \ - (rs) < (re); \ - (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end))) + /* no valid offsets were found - fail condition */ + *bit_off = pcpu_chunk_map_bits(chunk); +} + +/* + * Metadata free area iterators. These perform aggregation of free areas + * based on the metadata blocks and return the offset @bit_off and size in + * bits of the free area @bits. pcpu_for_each_fit_region only returns when + * a fit is found for the allocation request. + */ +#define pcpu_for_each_md_free_region(chunk, bit_off, bits) \ + for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits)); \ + (bit_off) < pcpu_chunk_map_bits((chunk)); \ + (bit_off) += (bits) + 1, \ + pcpu_next_md_free_region((chunk), &(bit_off), &(bits))) + +#define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) \ + for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \ + &(bits)); \ + (bit_off) < pcpu_chunk_map_bits((chunk)); \ + (bit_off) += (bits), \ + pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \ + &(bits))) /** * pcpu_mem_zalloc - allocate memory @@ -306,38 +477,6 @@ static void pcpu_mem_free(void *ptr) } /** - * pcpu_count_occupied_pages - count the number of pages an area occupies - * @chunk: chunk of interest - * @i: index of the area in question - * - * Count the number of pages chunk's @i'th area occupies. When the area's - * start and/or end address isn't aligned to page boundary, the straddled - * page is included in the count iff the rest of the page is free. - */ -static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i) -{ - int off = chunk->map[i] & ~1; - int end = chunk->map[i + 1] & ~1; - - if (!PAGE_ALIGNED(off) && i > 0) { - int prev = chunk->map[i - 1]; - - if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE)) - off = round_down(off, PAGE_SIZE); - } - - if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) { - int next = chunk->map[i + 1]; - int nend = chunk->map[i + 2] & ~1; - - if (!(next & 1) && nend >= round_up(end, PAGE_SIZE)) - end = round_up(end, PAGE_SIZE); - } - - return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0); -} - -/** * pcpu_chunk_relocate - put chunk in the appropriate chunk slot * @chunk: chunk of interest * @oslot: the previous slot it was on @@ -363,383 +502,706 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) } /** - * pcpu_need_to_extend - determine whether chunk area map needs to be extended + * pcpu_cnt_pop_pages- counts populated backing pages in range * @chunk: chunk of interest - * @is_atomic: the allocation context + * @bit_off: start offset + * @bits: size of area to check * - * Determine whether area map of @chunk needs to be extended. If - * @is_atomic, only the amount necessary for a new allocation is - * considered; however, async extension is scheduled if the left amount is - * low. If !@is_atomic, it aims for more empty space. Combined, this - * ensures that the map is likely to have enough available space to - * accomodate atomic allocations which can't extend maps directly. - * - * CONTEXT: - * pcpu_lock. + * Calculates the number of populated pages in the region + * [page_start, page_end). This keeps track of how many empty populated + * pages are available and decide if async work should be scheduled. * * RETURNS: - * New target map allocation length if extension is necessary, 0 - * otherwise. + * The nr of populated pages. */ -static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic) +static inline int pcpu_cnt_pop_pages(struct pcpu_chunk *chunk, int bit_off, + int bits) { - int margin, new_alloc; - - lockdep_assert_held(&pcpu_lock); - - if (is_atomic) { - margin = 3; + int page_start = PFN_UP(bit_off * PCPU_MIN_ALLOC_SIZE); + int page_end = PFN_DOWN((bit_off + bits) * PCPU_MIN_ALLOC_SIZE); - if (chunk->map_alloc < - chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) { - if (list_empty(&chunk->map_extend_list)) { - list_add_tail(&chunk->map_extend_list, - &pcpu_map_extend_chunks); - pcpu_schedule_balance_work(); - } - } - } else { - margin = PCPU_ATOMIC_MAP_MARGIN_HIGH; - } - - if (chunk->map_alloc >= chunk->map_used + margin) + if (page_start >= page_end) return 0; - new_alloc = PCPU_DFL_MAP_ALLOC; - while (new_alloc < chunk->map_used + margin) - new_alloc *= 2; - - return new_alloc; + /* + * bitmap_weight counts the number of bits set in a bitmap up to + * the specified number of bits. This is counting the populated + * pages up to page_end and then subtracting the populated pages + * up to page_start to count the populated pages in + * [page_start, page_end). + */ + return bitmap_weight(chunk->populated, page_end) - + bitmap_weight(chunk->populated, page_start); } /** - * pcpu_extend_area_map - extend area map of a chunk + * pcpu_chunk_update - updates the chunk metadata given a free area * @chunk: chunk of interest - * @new_alloc: new target allocation length of the area map + * @bit_off: chunk offset + * @bits: size of free area * - * Extend area map of @chunk to have @new_alloc entries. + * This updates the chunk's contig hint and starting offset given a free area. + * Choose the best starting offset if the contig hint is equal. + */ +static void pcpu_chunk_update(struct pcpu_chunk *chunk, int bit_off, int bits) +{ + if (bits > chunk->contig_bits) { + chunk->contig_bits_start = bit_off; + chunk->contig_bits = bits; + } else if (bits == chunk->contig_bits && chunk->contig_bits_start && + (!bit_off || + __ffs(bit_off) > __ffs(chunk->contig_bits_start))) { + /* use the start with the best alignment */ + chunk->contig_bits_start = bit_off; + } +} + +/** + * pcpu_chunk_refresh_hint - updates metadata about a chunk + * @chunk: chunk of interest * - * CONTEXT: - * Does GFP_KERNEL allocation. Grabs and releases pcpu_lock. + * Iterates over the metadata blocks to find the largest contig area. + * It also counts the populated pages and uses the delta to update the + * global count. * - * RETURNS: - * 0 on success, -errno on failure. + * Updates: + * chunk->contig_bits + * chunk->contig_bits_start + * nr_empty_pop_pages (chunk and global) */ -static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc) +static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk) { - int *old = NULL, *new = NULL; - size_t old_size = 0, new_size = new_alloc * sizeof(new[0]); - unsigned long flags; + int bit_off, bits, nr_empty_pop_pages; - lockdep_assert_held(&pcpu_alloc_mutex); + /* clear metadata */ + chunk->contig_bits = 0; - new = pcpu_mem_zalloc(new_size); - if (!new) - return -ENOMEM; + bit_off = chunk->first_bit; + bits = nr_empty_pop_pages = 0; + pcpu_for_each_md_free_region(chunk, bit_off, bits) { + pcpu_chunk_update(chunk, bit_off, bits); - /* acquire pcpu_lock and switch to new area map */ - spin_lock_irqsave(&pcpu_lock, flags); + nr_empty_pop_pages += pcpu_cnt_pop_pages(chunk, bit_off, bits); + } - if (new_alloc <= chunk->map_alloc) - goto out_unlock; + /* + * Keep track of nr_empty_pop_pages. + * + * The chunk maintains the previous number of free pages it held, + * so the delta is used to update the global counter. The reserved + * chunk is not part of the free page count as they are populated + * at init and are special to serving reserved allocations. + */ + if (chunk != pcpu_reserved_chunk) + pcpu_nr_empty_pop_pages += + (nr_empty_pop_pages - chunk->nr_empty_pop_pages); - old_size = chunk->map_alloc * sizeof(chunk->map[0]); - old = chunk->map; + chunk->nr_empty_pop_pages = nr_empty_pop_pages; +} - memcpy(new, old, old_size); +/** + * pcpu_block_update - updates a block given a free area + * @block: block of interest + * @start: start offset in block + * @end: end offset in block + * + * Updates a block given a known free area. The region [start, end) is + * expected to be the entirety of the free area within a block. Chooses + * the best starting offset if the contig hints are equal. + */ +static void pcpu_block_update(struct pcpu_block_md *block, int start, int end) +{ + int contig = end - start; + + block->first_free = min(block->first_free, start); + if (start == 0) + block->left_free = contig; + + if (end == PCPU_BITMAP_BLOCK_BITS) + block->right_free = contig; + + if (contig > block->contig_hint) { + block->contig_hint_start = start; + block->contig_hint = contig; + } else if (block->contig_hint_start && contig == block->contig_hint && + (!start || __ffs(start) > __ffs(block->contig_hint_start))) { + /* use the start with the best alignment */ + block->contig_hint_start = start; + } +} - chunk->map_alloc = new_alloc; - chunk->map = new; - new = NULL; +/** + * pcpu_block_refresh_hint + * @chunk: chunk of interest + * @index: index of the metadata block + * + * Scans over the block beginning at first_free and updates the block + * metadata accordingly. + */ +static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) +{ + struct pcpu_block_md *block = chunk->md_blocks + index; + unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index); + int rs, re; /* region start, region end */ + + /* clear hints */ + block->contig_hint = 0; + block->left_free = block->right_free = 0; + + /* iterate over free areas and update the contig hints */ + pcpu_for_each_unpop_region(alloc_map, rs, re, block->first_free, + PCPU_BITMAP_BLOCK_BITS) { + pcpu_block_update(block, rs, re); + } +} -out_unlock: - spin_unlock_irqrestore(&pcpu_lock, flags); +/** + * pcpu_block_update_hint_alloc - update hint on allocation path + * @chunk: chunk of interest + * @bit_off: chunk offset + * @bits: size of request + * + * Updates metadata for the allocation path. The metadata only has to be + * refreshed by a full scan iff the chunk's contig hint is broken. Block level + * scans are required if the block's contig hint is broken. + */ +static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, + int bits) +{ + struct pcpu_block_md *s_block, *e_block, *block; + int s_index, e_index; /* block indexes of the freed allocation */ + int s_off, e_off; /* block offsets of the freed allocation */ /* - * pcpu_mem_free() might end up calling vfree() which uses - * IRQ-unsafe lock and thus can't be called under pcpu_lock. + * Calculate per block offsets. + * The calculation uses an inclusive range, but the resulting offsets + * are [start, end). e_index always points to the last block in the + * range. */ - pcpu_mem_free(old); - pcpu_mem_free(new); + s_index = pcpu_off_to_block_index(bit_off); + e_index = pcpu_off_to_block_index(bit_off + bits - 1); + s_off = pcpu_off_to_block_off(bit_off); + e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1; - return 0; + s_block = chunk->md_blocks + s_index; + e_block = chunk->md_blocks + e_index; + + /* + * Update s_block. + * block->first_free must be updated if the allocation takes its place. + * If the allocation breaks the contig_hint, a scan is required to + * restore this hint. + */ + if (s_off == s_block->first_free) + s_block->first_free = find_next_zero_bit( + pcpu_index_alloc_map(chunk, s_index), + PCPU_BITMAP_BLOCK_BITS, + s_off + bits); + + if (s_off >= s_block->contig_hint_start && + s_off < s_block->contig_hint_start + s_block->contig_hint) { + /* block contig hint is broken - scan to fix it */ + pcpu_block_refresh_hint(chunk, s_index); + } else { + /* update left and right contig manually */ + s_block->left_free = min(s_block->left_free, s_off); + if (s_index == e_index) + s_block->right_free = min_t(int, s_block->right_free, + PCPU_BITMAP_BLOCK_BITS - e_off); + else + s_block->right_free = 0; + } + + /* + * Update e_block. + */ + if (s_index != e_index) { + /* + * When the allocation is across blocks, the end is along + * the left part of the e_block. + */ + e_block->first_free = find_next_zero_bit( + pcpu_index_alloc_map(chunk, e_index), + PCPU_BITMAP_BLOCK_BITS, e_off); + + if (e_off == PCPU_BITMAP_BLOCK_BITS) { + /* reset the block */ + e_block++; + } else { + if (e_off > e_block->contig_hint_start) { + /* contig hint is broken - scan to fix it */ + pcpu_block_refresh_hint(chunk, e_index); + } else { + e_block->left_free = 0; + e_block->right_free = + min_t(int, e_block->right_free, + PCPU_BITMAP_BLOCK_BITS - e_off); + } + } + + /* update in-between md_blocks */ + for (block = s_block + 1; block < e_block; block++) { + block->contig_hint = 0; + block->left_free = 0; + block->right_free = 0; + } + } + + /* + * The only time a full chunk scan is required is if the chunk + * contig hint is broken. Otherwise, it means a smaller space + * was used and therefore the chunk contig hint is still correct. + */ + if (bit_off >= chunk->contig_bits_start && + bit_off < chunk->contig_bits_start + chunk->contig_bits) + pcpu_chunk_refresh_hint(chunk); } /** - * pcpu_fit_in_area - try to fit the requested allocation in a candidate area - * @chunk: chunk the candidate area belongs to - * @off: the offset to the start of the candidate area - * @this_size: the size of the candidate area - * @size: the size of the target allocation - * @align: the alignment of the target allocation - * @pop_only: only allocate from already populated region - * - * We're trying to allocate @size bytes aligned at @align. @chunk's area - * at @off sized @this_size is a candidate. This function determines - * whether the target allocation fits in the candidate area and returns the - * number of bytes to pad after @off. If the target area doesn't fit, -1 - * is returned. - * - * If @pop_only is %true, this function only considers the already - * populated part of the candidate area. + * pcpu_block_update_hint_free - updates the block hints on the free path + * @chunk: chunk of interest + * @bit_off: chunk offset + * @bits: size of request + * + * Updates metadata for the allocation path. This avoids a blind block + * refresh by making use of the block contig hints. If this fails, it scans + * forward and backward to determine the extent of the free area. This is + * capped at the boundary of blocks. + * + * A chunk update is triggered if a page becomes free, a block becomes free, + * or the free spans across blocks. This tradeoff is to minimize iterating + * over the block metadata to update chunk->contig_bits. chunk->contig_bits + * may be off by up to a page, but it will never be more than the available + * space. If the contig hint is contained in one block, it will be accurate. */ -static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size, - int size, int align, bool pop_only) +static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, + int bits) { - int cand_off = off; - - while (true) { - int head = ALIGN(cand_off, align) - off; - int page_start, page_end, rs, re; + struct pcpu_block_md *s_block, *e_block, *block; + int s_index, e_index; /* block indexes of the freed allocation */ + int s_off, e_off; /* block offsets of the freed allocation */ + int start, end; /* start and end of the whole free area */ - if (this_size < head + size) - return -1; + /* + * Calculate per block offsets. + * The calculation uses an inclusive range, but the resulting offsets + * are [start, end). e_index always points to the last block in the + * range. + */ + s_index = pcpu_off_to_block_index(bit_off); + e_index = pcpu_off_to_block_index(bit_off + bits - 1); + s_off = pcpu_off_to_block_off(bit_off); + e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1; - if (!pop_only) - return head; + s_block = chunk->md_blocks + s_index; + e_block = chunk->md_blocks + e_index; + /* + * Check if the freed area aligns with the block->contig_hint. + * If it does, then the scan to find the beginning/end of the + * larger free area can be avoided. + * + * start and end refer to beginning and end of the free area + * within each their respective blocks. This is not necessarily + * the entire free area as it may span blocks past the beginning + * or end of the block. + */ + start = s_off; + if (s_off == s_block->contig_hint + s_block->contig_hint_start) { + start = s_block->contig_hint_start; + } else { /* - * If the first unpopulated page is beyond the end of the - * allocation, the whole allocation is populated; - * otherwise, retry from the end of the unpopulated area. + * Scan backwards to find the extent of the free area. + * find_last_bit returns the starting bit, so if the start bit + * is returned, that means there was no last bit and the + * remainder of the chunk is free. */ - page_start = PFN_DOWN(head + off); - page_end = PFN_UP(head + off + size); - - rs = page_start; - pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size)); - if (rs >= page_end) - return head; - cand_off = re * PAGE_SIZE; + int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), + start); + start = (start == l_bit) ? 0 : l_bit + 1; + } + + end = e_off; + if (e_off == e_block->contig_hint_start) + end = e_block->contig_hint_start + e_block->contig_hint; + else + end = find_next_bit(pcpu_index_alloc_map(chunk, e_index), + PCPU_BITMAP_BLOCK_BITS, end); + + /* update s_block */ + e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS; + pcpu_block_update(s_block, start, e_off); + + /* freeing in the same block */ + if (s_index != e_index) { + /* update e_block */ + pcpu_block_update(e_block, 0, end); + + /* reset md_blocks in the middle */ + for (block = s_block + 1; block < e_block; block++) { + block->first_free = 0; + block->contig_hint_start = 0; + block->contig_hint = PCPU_BITMAP_BLOCK_BITS; + block->left_free = PCPU_BITMAP_BLOCK_BITS; + block->right_free = PCPU_BITMAP_BLOCK_BITS; + } } + + /* + * Refresh chunk metadata when the free makes a page free, a block + * free, or spans across blocks. The contig hint may be off by up to + * a page, but if the hint is contained in a block, it will be accurate + * with the else condition below. + */ + if ((ALIGN_DOWN(end, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS)) > + ALIGN(start, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS))) || + s_index != e_index) + pcpu_chunk_refresh_hint(chunk); + else + pcpu_chunk_update(chunk, pcpu_block_off_to_off(s_index, start), + s_block->contig_hint); } /** - * pcpu_alloc_area - allocate area from a pcpu_chunk + * pcpu_is_populated - determines if the region is populated * @chunk: chunk of interest - * @size: wanted size in bytes - * @align: wanted align - * @pop_only: allocate only from the populated area - * @occ_pages_p: out param for the number of pages the area occupies - * - * Try to allocate @size bytes area aligned at @align from @chunk. - * Note that this function only allocates the offset. It doesn't - * populate or map the area. + * @bit_off: chunk offset + * @bits: size of area + * @next_off: return value for the next offset to start searching * - * @chunk->map must have at least two free slots. + * For atomic allocations, check if the backing pages are populated. * - * CONTEXT: - * pcpu_lock. + * RETURNS: + * Bool if the backing pages are populated. + * next_index is to skip over unpopulated blocks in pcpu_find_block_fit. + */ +static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits, + int *next_off) +{ + int page_start, page_end, rs, re; + + page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE); + page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE); + + rs = page_start; + pcpu_next_unpop(chunk->populated, &rs, &re, page_end); + if (rs >= page_end) + return true; + + *next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; + return false; +} + +/** + * pcpu_find_block_fit - finds the block index to start searching + * @chunk: chunk of interest + * @alloc_bits: size of request in allocation units + * @align: alignment of area (max PAGE_SIZE bytes) + * @pop_only: use populated regions only + * + * Given a chunk and an allocation spec, find the offset to begin searching + * for a free region. This iterates over the bitmap metadata blocks to + * find an offset that will be guaranteed to fit the requirements. It is + * not quite first fit as if the allocation does not fit in the contig hint + * of a block or chunk, it is skipped. This errs on the side of caution + * to prevent excess iteration. Poor alignment can cause the allocator to + * skip over blocks and chunks that have valid free areas. * * RETURNS: - * Allocated offset in @chunk on success, -1 if no matching area is - * found. + * The offset in the bitmap to begin searching. + * -1 if no offset is found. */ -static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, - bool pop_only, int *occ_pages_p) +static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits, + size_t align, bool pop_only) { - int oslot = pcpu_chunk_slot(chunk); - int max_contig = 0; - int i, off; - bool seen_free = false; - int *p; - - for (i = chunk->first_free, p = chunk->map + i; i < chunk->map_used; i++, p++) { - int head, tail; - int this_size; - - off = *p; - if (off & 1) - continue; + int bit_off, bits, next_off; - this_size = (p[1] & ~1) - off; + /* + * Check to see if the allocation can fit in the chunk's contig hint. + * This is an optimization to prevent scanning by assuming if it + * cannot fit in the global hint, there is memory pressure and creating + * a new chunk would happen soon. + */ + bit_off = ALIGN(chunk->contig_bits_start, align) - + chunk->contig_bits_start; + if (bit_off + alloc_bits > chunk->contig_bits) + return -1; + + bit_off = chunk->first_bit; + bits = 0; + pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) { + if (!pop_only || pcpu_is_populated(chunk, bit_off, bits, + &next_off)) + break; - head = pcpu_fit_in_area(chunk, off, this_size, size, align, - pop_only); - if (head < 0) { - if (!seen_free) { - chunk->first_free = i; - seen_free = true; - } - max_contig = max(this_size, max_contig); - continue; - } + bit_off = next_off; + bits = 0; + } - /* - * If head is small or the previous block is free, - * merge'em. Note that 'small' is defined as smaller - * than sizeof(int), which is very small but isn't too - * uncommon for percpu allocations. - */ - if (head && (head < sizeof(int) || !(p[-1] & 1))) { - *p = off += head; - if (p[-1] & 1) - chunk->free_size -= head; - else - max_contig = max(*p - p[-1], max_contig); - this_size -= head; - head = 0; - } + if (bit_off == pcpu_chunk_map_bits(chunk)) + return -1; - /* if tail is small, just keep it around */ - tail = this_size - head - size; - if (tail < sizeof(int)) { - tail = 0; - size = this_size - head; - } + return bit_off; +} - /* split if warranted */ - if (head || tail) { - int nr_extra = !!head + !!tail; - - /* insert new subblocks */ - memmove(p + nr_extra + 1, p + 1, - sizeof(chunk->map[0]) * (chunk->map_used - i)); - chunk->map_used += nr_extra; - - if (head) { - if (!seen_free) { - chunk->first_free = i; - seen_free = true; - } - *++p = off += head; - ++i; - max_contig = max(head, max_contig); - } - if (tail) { - p[1] = off + size; - max_contig = max(tail, max_contig); - } - } +/** + * pcpu_alloc_area - allocates an area from a pcpu_chunk + * @chunk: chunk of interest + * @alloc_bits: size of request in allocation units + * @align: alignment of area (max PAGE_SIZE) + * @start: bit_off to start searching + * + * This function takes in a @start offset to begin searching to fit an + * allocation of @alloc_bits with alignment @align. It needs to scan + * the allocation map because if it fits within the block's contig hint, + * @start will be block->first_free. This is an attempt to fill the + * allocation prior to breaking the contig hint. The allocation and + * boundary maps are updated accordingly if it confirms a valid + * free area. + * + * RETURNS: + * Allocated addr offset in @chunk on success. + * -1 if no matching area is found. + */ +static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits, + size_t align, int start) +{ + size_t align_mask = (align) ? (align - 1) : 0; + int bit_off, end, oslot; - if (!seen_free) - chunk->first_free = i + 1; + lockdep_assert_held(&pcpu_lock); - /* update hint and mark allocated */ - if (i + 1 == chunk->map_used) - chunk->contig_hint = max_contig; /* fully scanned */ - else - chunk->contig_hint = max(chunk->contig_hint, - max_contig); + oslot = pcpu_chunk_slot(chunk); - chunk->free_size -= size; - *p |= 1; + /* + * Search to find a fit. + */ + end = start + alloc_bits + PCPU_BITMAP_BLOCK_BITS; + bit_off = bitmap_find_next_zero_area(chunk->alloc_map, end, start, + alloc_bits, align_mask); + if (bit_off >= end) + return -1; - *occ_pages_p = pcpu_count_occupied_pages(chunk, i); - pcpu_chunk_relocate(chunk, oslot); - return off; - } + /* update alloc map */ + bitmap_set(chunk->alloc_map, bit_off, alloc_bits); + + /* update boundary map */ + set_bit(bit_off, chunk->bound_map); + bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1); + set_bit(bit_off + alloc_bits, chunk->bound_map); + + chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE; + + /* update first free bit */ + if (bit_off == chunk->first_bit) + chunk->first_bit = find_next_zero_bit( + chunk->alloc_map, + pcpu_chunk_map_bits(chunk), + bit_off + alloc_bits); + + pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits); - chunk->contig_hint = max_contig; /* fully scanned */ pcpu_chunk_relocate(chunk, oslot); - /* tell the upper layer that this chunk has no matching area */ - return -1; + return bit_off * PCPU_MIN_ALLOC_SIZE; } /** - * pcpu_free_area - free area to a pcpu_chunk + * pcpu_free_area - frees the corresponding offset * @chunk: chunk of interest - * @freeme: offset of area to free - * @occ_pages_p: out param for the number of pages the area occupies - * - * Free area starting from @freeme to @chunk. Note that this function - * only modifies the allocation map. It doesn't depopulate or unmap - * the area. + * @off: addr offset into chunk * - * CONTEXT: - * pcpu_lock. + * This function determines the size of an allocation to free using + * the boundary bitmap and clears the allocation map. */ -static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme, - int *occ_pages_p) +static void pcpu_free_area(struct pcpu_chunk *chunk, int off) { - int oslot = pcpu_chunk_slot(chunk); - int off = 0; - unsigned i, j; - int to_free = 0; - int *p; + int bit_off, bits, end, oslot; lockdep_assert_held(&pcpu_lock); pcpu_stats_area_dealloc(chunk); - freeme |= 1; /* we are searching for <given offset, in use> pair */ - - i = 0; - j = chunk->map_used; - while (i != j) { - unsigned k = (i + j) / 2; - off = chunk->map[k]; - if (off < freeme) - i = k + 1; - else if (off > freeme) - j = k; - else - i = j = k; + oslot = pcpu_chunk_slot(chunk); + + bit_off = off / PCPU_MIN_ALLOC_SIZE; + + /* find end index */ + end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk), + bit_off + 1); + bits = end - bit_off; + bitmap_clear(chunk->alloc_map, bit_off, bits); + + /* update metadata */ + chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE; + + /* update first free bit */ + chunk->first_bit = min(chunk->first_bit, bit_off); + + pcpu_block_update_hint_free(chunk, bit_off, bits); + + pcpu_chunk_relocate(chunk, oslot); +} + +static void pcpu_init_md_blocks(struct pcpu_chunk *chunk) +{ + struct pcpu_block_md *md_block; + + for (md_block = chunk->md_blocks; + md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk); + md_block++) { + md_block->contig_hint = PCPU_BITMAP_BLOCK_BITS; + md_block->left_free = PCPU_BITMAP_BLOCK_BITS; + md_block->right_free = PCPU_BITMAP_BLOCK_BITS; } - BUG_ON(off != freeme); +} - if (i < chunk->first_free) - chunk->first_free = i; +/** + * pcpu_alloc_first_chunk - creates chunks that serve the first chunk + * @tmp_addr: the start of the region served + * @map_size: size of the region served + * + * This is responsible for creating the chunks that serve the first chunk. The + * base_addr is page aligned down of @tmp_addr while the region end is page + * aligned up. Offsets are kept track of to determine the region served. All + * this is done to appease the bitmap allocator in avoiding partial blocks. + * + * RETURNS: + * Chunk serving the region at @tmp_addr of @map_size. + */ +static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, + int map_size) +{ + struct pcpu_chunk *chunk; + unsigned long aligned_addr, lcm_align; + int start_offset, offset_bits, region_size, region_bits; - p = chunk->map + i; - *p = off &= ~1; - chunk->free_size += (p[1] & ~1) - off; + /* region calculations */ + aligned_addr = tmp_addr & PAGE_MASK; - *occ_pages_p = pcpu_count_occupied_pages(chunk, i); + start_offset = tmp_addr - aligned_addr; - /* merge with next? */ - if (!(p[1] & 1)) - to_free++; - /* merge with previous? */ - if (i > 0 && !(p[-1] & 1)) { - to_free++; - i--; - p--; + /* + * Align the end of the region with the LCM of PAGE_SIZE and + * PCPU_BITMAP_BLOCK_SIZE. One of these constants is a multiple of + * the other. + */ + lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE); + region_size = ALIGN(start_offset + map_size, lcm_align); + + /* allocate chunk */ + chunk = memblock_virt_alloc(sizeof(struct pcpu_chunk) + + BITS_TO_LONGS(region_size >> PAGE_SHIFT), + 0); + + INIT_LIST_HEAD(&chunk->list); + + chunk->base_addr = (void *)aligned_addr; + chunk->start_offset = start_offset; + chunk->end_offset = region_size - chunk->start_offset - map_size; + + chunk->nr_pages = region_size >> PAGE_SHIFT; + region_bits = pcpu_chunk_map_bits(chunk); + + chunk->alloc_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits) * + sizeof(chunk->alloc_map[0]), 0); + chunk->bound_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits + 1) * + sizeof(chunk->bound_map[0]), 0); + chunk->md_blocks = memblock_virt_alloc(pcpu_chunk_nr_blocks(chunk) * + sizeof(chunk->md_blocks[0]), 0); + pcpu_init_md_blocks(chunk); + + /* manage populated page bitmap */ + chunk->immutable = true; + bitmap_fill(chunk->populated, chunk->nr_pages); + chunk->nr_populated = chunk->nr_pages; + chunk->nr_empty_pop_pages = + pcpu_cnt_pop_pages(chunk, start_offset / PCPU_MIN_ALLOC_SIZE, + map_size / PCPU_MIN_ALLOC_SIZE); + + chunk->contig_bits = map_size / PCPU_MIN_ALLOC_SIZE; + chunk->free_bytes = map_size; + + if (chunk->start_offset) { + /* hide the beginning of the bitmap */ + offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE; + bitmap_set(chunk->alloc_map, 0, offset_bits); + set_bit(0, chunk->bound_map); + set_bit(offset_bits, chunk->bound_map); + + chunk->first_bit = offset_bits; + + pcpu_block_update_hint_alloc(chunk, 0, offset_bits); } - if (to_free) { - chunk->map_used -= to_free; - memmove(p + 1, p + 1 + to_free, - (chunk->map_used - i) * sizeof(chunk->map[0])); + + if (chunk->end_offset) { + /* hide the end of the bitmap */ + offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE; + bitmap_set(chunk->alloc_map, + pcpu_chunk_map_bits(chunk) - offset_bits, + offset_bits); + set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE, + chunk->bound_map); + set_bit(region_bits, chunk->bound_map); + + pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk) + - offset_bits, offset_bits); } - chunk->contig_hint = max(chunk->map[i + 1] - chunk->map[i] - 1, chunk->contig_hint); - pcpu_chunk_relocate(chunk, oslot); + return chunk; } static struct pcpu_chunk *pcpu_alloc_chunk(void) { struct pcpu_chunk *chunk; + int region_bits; chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size); if (!chunk) return NULL; - chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC * - sizeof(chunk->map[0])); - if (!chunk->map) { - pcpu_mem_free(chunk); - return NULL; - } + INIT_LIST_HEAD(&chunk->list); + chunk->nr_pages = pcpu_unit_pages; + region_bits = pcpu_chunk_map_bits(chunk); - chunk->map_alloc = PCPU_DFL_MAP_ALLOC; - chunk->map[0] = 0; - chunk->map[1] = pcpu_unit_size | 1; - chunk->map_used = 1; - chunk->has_reserved = false; + chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) * + sizeof(chunk->alloc_map[0])); + if (!chunk->alloc_map) + goto alloc_map_fail; - INIT_LIST_HEAD(&chunk->list); - INIT_LIST_HEAD(&chunk->map_extend_list); - chunk->free_size = pcpu_unit_size; - chunk->contig_hint = pcpu_unit_size; + chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) * + sizeof(chunk->bound_map[0])); + if (!chunk->bound_map) + goto bound_map_fail; + + chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) * + sizeof(chunk->md_blocks[0])); + if (!chunk->md_blocks) + goto md_blocks_fail; + + pcpu_init_md_blocks(chunk); + + /* init metadata */ + chunk->contig_bits = region_bits; + chunk->free_bytes = chunk->nr_pages * PAGE_SIZE; return chunk; + +md_blocks_fail: + pcpu_mem_free(chunk->bound_map); +bound_map_fail: + pcpu_mem_free(chunk->alloc_map); +alloc_map_fail: + pcpu_mem_free(chunk); + + return NULL; } static void pcpu_free_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; - pcpu_mem_free(chunk->map); + pcpu_mem_free(chunk->bound_map); + pcpu_mem_free(chunk->alloc_map); pcpu_mem_free(chunk); } @@ -748,13 +1210,17 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) * @chunk: pcpu_chunk which got populated * @page_start: the start page * @page_end: the end page + * @for_alloc: if this is to populate for allocation * * Pages in [@page_start,@page_end) have been populated to @chunk. Update * the bookkeeping information accordingly. Must be called after each * successful population. + * + * If this is @for_alloc, do not increment pcpu_nr_empty_pop_pages because it + * is to serve an allocation in that area. */ -static void pcpu_chunk_populated(struct pcpu_chunk *chunk, - int page_start, int page_end) +static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start, + int page_end, bool for_alloc) { int nr = page_end - page_start; @@ -762,7 +1228,11 @@ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, bitmap_set(chunk->populated, page_start, nr); chunk->nr_populated += nr; - pcpu_nr_empty_pop_pages += nr; + + if (!for_alloc) { + chunk->nr_empty_pop_pages += nr; + pcpu_nr_empty_pop_pages += nr; + } } /** @@ -784,6 +1254,7 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, bitmap_clear(chunk->populated, page_start, nr); chunk->nr_populated -= nr; + chunk->nr_empty_pop_pages -= nr; pcpu_nr_empty_pop_pages -= nr; } @@ -819,18 +1290,21 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai); * pcpu_chunk_addr_search - determine chunk containing specified address * @addr: address for which the chunk needs to be determined. * + * This is an internal function that handles all but static allocations. + * Static percpu address values should never be passed into the allocator. + * * RETURNS: * The address of the found chunk. */ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) { - /* is it in the first chunk? */ - if (pcpu_addr_in_first_chunk(addr)) { - /* is it in the reserved area? */ - if (pcpu_addr_in_reserved_chunk(addr)) - return pcpu_reserved_chunk; + /* is it in the dynamic region (first chunk)? */ + if (pcpu_addr_in_chunk(pcpu_first_chunk, addr)) return pcpu_first_chunk; - } + + /* is it in the reserved region? */ + if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr)) + return pcpu_reserved_chunk; /* * The address is relative to unit0 which might be unused and @@ -863,19 +1337,23 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, struct pcpu_chunk *chunk; const char *err; bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; - int occ_pages = 0; - int slot, off, new_alloc, cpu, ret; + int slot, off, cpu, ret; unsigned long flags; void __percpu *ptr; + size_t bits, bit_align; /* - * We want the lowest bit of offset available for in-use/free - * indicator, so force >= 16bit alignment and make size even. + * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE, + * therefore alignment must be a minimum of that many bytes. + * An allocation may have internal fragmentation from rounding up + * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes. */ - if (unlikely(align < 2)) - align = 2; + if (unlikely(align < PCPU_MIN_ALLOC_SIZE)) + align = PCPU_MIN_ALLOC_SIZE; - size = ALIGN(size, 2); + size = ALIGN(size, PCPU_MIN_ALLOC_SIZE); + bits = size >> PCPU_MIN_ALLOC_SHIFT; + bit_align = align >> PCPU_MIN_ALLOC_SHIFT; if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE || !is_power_of_2(align))) { @@ -893,23 +1371,13 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, if (reserved && pcpu_reserved_chunk) { chunk = pcpu_reserved_chunk; - if (size > chunk->contig_hint) { + off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic); + if (off < 0) { err = "alloc from reserved chunk failed"; goto fail_unlock; } - while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) { - spin_unlock_irqrestore(&pcpu_lock, flags); - if (is_atomic || - pcpu_extend_area_map(chunk, new_alloc) < 0) { - err = "failed to extend area map of reserved chunk"; - goto fail; - } - spin_lock_irqsave(&pcpu_lock, flags); - } - - off = pcpu_alloc_area(chunk, size, align, is_atomic, - &occ_pages); + off = pcpu_alloc_area(chunk, bits, bit_align, off); if (off >= 0) goto area_found; @@ -921,31 +1389,15 @@ restart: /* search through normal chunks */ for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { list_for_each_entry(chunk, &pcpu_slot[slot], list) { - if (size > chunk->contig_hint) + off = pcpu_find_block_fit(chunk, bits, bit_align, + is_atomic); + if (off < 0) continue; - new_alloc = pcpu_need_to_extend(chunk, is_atomic); - if (new_alloc) { - if (is_atomic) - continue; - spin_unlock_irqrestore(&pcpu_lock, flags); - if (pcpu_extend_area_map(chunk, - new_alloc) < 0) { - err = "failed to extend area map"; - goto fail; - } - spin_lock_irqsave(&pcpu_lock, flags); - /* - * pcpu_lock has been dropped, need to - * restart cpu_slot list walking. - */ - goto restart; - } - - off = pcpu_alloc_area(chunk, size, align, is_atomic, - &occ_pages); + off = pcpu_alloc_area(chunk, bits, bit_align, off); if (off >= 0) goto area_found; + } } @@ -987,30 +1439,25 @@ area_found: page_start = PFN_DOWN(off); page_end = PFN_UP(off + size); - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + pcpu_for_each_unpop_region(chunk->populated, rs, re, + page_start, page_end) { WARN_ON(chunk->immutable); ret = pcpu_populate_chunk(chunk, rs, re); spin_lock_irqsave(&pcpu_lock, flags); if (ret) { - pcpu_free_area(chunk, off, &occ_pages); + pcpu_free_area(chunk, off); err = "failed to populate"; goto fail_unlock; } - pcpu_chunk_populated(chunk, rs, re); + pcpu_chunk_populated(chunk, rs, re, true); spin_unlock_irqrestore(&pcpu_lock, flags); } mutex_unlock(&pcpu_alloc_mutex); } - if (chunk != pcpu_reserved_chunk) { - spin_lock_irqsave(&pcpu_lock, flags); - pcpu_nr_empty_pop_pages -= occ_pages; - spin_unlock_irqrestore(&pcpu_lock, flags); - } - if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) pcpu_schedule_balance_work(); @@ -1128,7 +1575,6 @@ static void pcpu_balance_workfn(struct work_struct *work) if (chunk == list_first_entry(free_head, struct pcpu_chunk, list)) continue; - list_del_init(&chunk->map_extend_list); list_move(&chunk->list, &to_free); } @@ -1137,7 +1583,8 @@ static void pcpu_balance_workfn(struct work_struct *work) list_for_each_entry_safe(chunk, next, &to_free, list) { int rs, re; - pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) { + pcpu_for_each_pop_region(chunk->populated, rs, re, 0, + chunk->nr_pages) { pcpu_depopulate_chunk(chunk, rs, re); spin_lock_irq(&pcpu_lock); pcpu_chunk_depopulated(chunk, rs, re); @@ -1146,25 +1593,6 @@ static void pcpu_balance_workfn(struct work_struct *work) pcpu_destroy_chunk(chunk); } - /* service chunks which requested async area map extension */ - do { - int new_alloc = 0; - - spin_lock_irq(&pcpu_lock); - - chunk = list_first_entry_or_null(&pcpu_map_extend_chunks, - struct pcpu_chunk, map_extend_list); - if (chunk) { - list_del_init(&chunk->map_extend_list); - new_alloc = pcpu_need_to_extend(chunk, false); - } - - spin_unlock_irq(&pcpu_lock); - - if (new_alloc) - pcpu_extend_area_map(chunk, new_alloc); - } while (chunk); - /* * Ensure there are certain number of free populated pages for * atomic allocs. Fill up from the most packed so that atomic @@ -1194,7 +1622,7 @@ retry_pop: spin_lock_irq(&pcpu_lock); list_for_each_entry(chunk, &pcpu_slot[slot], list) { - nr_unpop = pcpu_unit_pages - chunk->nr_populated; + nr_unpop = chunk->nr_pages - chunk->nr_populated; if (nr_unpop) break; } @@ -1204,14 +1632,15 @@ retry_pop: continue; /* @chunk can't go away while pcpu_alloc_mutex is held */ - pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) { + pcpu_for_each_unpop_region(chunk->populated, rs, re, 0, + chunk->nr_pages) { int nr = min(re - rs, nr_to_pop); ret = pcpu_populate_chunk(chunk, rs, rs + nr); if (!ret) { nr_to_pop -= nr; spin_lock_irq(&pcpu_lock); - pcpu_chunk_populated(chunk, rs, rs + nr); + pcpu_chunk_populated(chunk, rs, rs + nr, false); spin_unlock_irq(&pcpu_lock); } else { nr_to_pop = 0; @@ -1250,7 +1679,7 @@ void free_percpu(void __percpu *ptr) void *addr; struct pcpu_chunk *chunk; unsigned long flags; - int off, occ_pages; + int off; if (!ptr) return; @@ -1264,13 +1693,10 @@ void free_percpu(void __percpu *ptr) chunk = pcpu_chunk_addr_search(addr); off = addr - chunk->base_addr; - pcpu_free_area(chunk, off, &occ_pages); - - if (chunk != pcpu_reserved_chunk) - pcpu_nr_empty_pop_pages += occ_pages; + pcpu_free_area(chunk, off); /* if there are more than one fully free chunks, wake up grim reaper */ - if (chunk->free_size == pcpu_unit_size) { + if (chunk->free_bytes == pcpu_unit_size) { struct pcpu_chunk *pos; list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) @@ -1361,10 +1787,16 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr) * The following test on unit_low/high isn't strictly * necessary but will speed up lookups of addresses which * aren't in the first chunk. + * + * The address check is against full chunk sizes. pcpu_base_addr + * points to the beginning of the first chunk including the + * static region. Assumes good intent as the first chunk may + * not be full (ie. < pcpu_unit_pages in size). */ - first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0); - first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu, - pcpu_unit_pages); + first_low = (unsigned long)pcpu_base_addr + + pcpu_unit_page_offset(pcpu_low_unit_cpu, 0); + first_high = (unsigned long)pcpu_base_addr + + pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages); if ((unsigned long)addr >= first_low && (unsigned long)addr < first_high) { for_each_possible_cpu(cpu) { @@ -1546,12 +1978,13 @@ static void pcpu_dump_alloc_info(const char *lvl, * The caller should have mapped the first chunk at @base_addr and * copied static data to each unit. * - * If the first chunk ends up with both reserved and dynamic areas, it - * is served by two chunks - one to serve the core static and reserved - * areas and the other for the dynamic area. They share the same vm - * and page map but uses different area allocation map to stay away - * from each other. The latter chunk is circulated in the chunk slots - * and available for dynamic allocation like any other chunks. + * The first chunk will always contain a static and a dynamic region. + * However, the static region is not managed by any chunk. If the first + * chunk also contains a reserved region, it is served by two chunks - + * one for the reserved region and one for the dynamic region. They + * share the same vm, but use offset regions in the area allocation map. + * The chunk serving the dynamic region is circulated in the chunk slots + * and available for dynamic allocation like any other chunk. * * RETURNS: * 0 on success, -errno on failure. @@ -1559,17 +1992,17 @@ static void pcpu_dump_alloc_info(const char *lvl, int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, void *base_addr) { - static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; - static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; - size_t dyn_size = ai->dyn_size; - size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; - struct pcpu_chunk *schunk, *dchunk = NULL; + size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; + size_t static_size, dyn_size; + struct pcpu_chunk *chunk; unsigned long *group_offsets; size_t *group_sizes; unsigned long *unit_off; unsigned int cpu; int *unit_map; int group, unit, i; + int map_size; + unsigned long tmp_addr; #define PCPU_SETUP_BUG_ON(cond) do { \ if (unlikely(cond)) { \ @@ -1592,7 +2025,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size)); PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); + PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE)); PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE); + PCPU_SETUP_BUG_ON(!ai->dyn_size); + PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE)); + PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) || + IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE))); PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); /* process group information and build config tables accordingly */ @@ -1671,64 +2109,41 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, INIT_LIST_HEAD(&pcpu_slot[i]); /* - * Initialize static chunk. If reserved_size is zero, the - * static chunk covers static area + dynamic allocation area - * in the first chunk. If reserved_size is not zero, it - * covers static area + reserved area (mostly used for module - * static percpu allocation). + * The end of the static region needs to be aligned with the + * minimum allocation size as this offsets the reserved and + * dynamic region. The first chunk ends page aligned by + * expanding the dynamic region, therefore the dynamic region + * can be shrunk to compensate while still staying above the + * configured sizes. */ - schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); - INIT_LIST_HEAD(&schunk->list); - INIT_LIST_HEAD(&schunk->map_extend_list); - schunk->base_addr = base_addr; - schunk->map = smap; - schunk->map_alloc = ARRAY_SIZE(smap); - schunk->immutable = true; - bitmap_fill(schunk->populated, pcpu_unit_pages); - schunk->nr_populated = pcpu_unit_pages; + static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE); + dyn_size = ai->dyn_size - (static_size - ai->static_size); - if (ai->reserved_size) { - schunk->free_size = ai->reserved_size; - pcpu_reserved_chunk = schunk; - pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size; - } else { - schunk->free_size = dyn_size; - dyn_size = 0; /* dynamic area covered */ - } - schunk->contig_hint = schunk->free_size; - - schunk->map[0] = 1; - schunk->map[1] = ai->static_size; - schunk->map_used = 1; - if (schunk->free_size) - schunk->map[++schunk->map_used] = ai->static_size + schunk->free_size; - schunk->map[schunk->map_used] |= 1; - schunk->has_reserved = true; + /* + * Initialize first chunk. + * If the reserved_size is non-zero, this initializes the reserved + * chunk. If the reserved_size is zero, the reserved chunk is NULL + * and the dynamic region is initialized here. The first chunk, + * pcpu_first_chunk, will always point to the chunk that serves + * the dynamic region. + */ + tmp_addr = (unsigned long)base_addr + static_size; + map_size = ai->reserved_size ?: dyn_size; + chunk = pcpu_alloc_first_chunk(tmp_addr, map_size); /* init dynamic chunk if necessary */ - if (dyn_size) { - dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); - INIT_LIST_HEAD(&dchunk->list); - INIT_LIST_HEAD(&dchunk->map_extend_list); - dchunk->base_addr = base_addr; - dchunk->map = dmap; - dchunk->map_alloc = ARRAY_SIZE(dmap); - dchunk->immutable = true; - bitmap_fill(dchunk->populated, pcpu_unit_pages); - dchunk->nr_populated = pcpu_unit_pages; - - dchunk->contig_hint = dchunk->free_size = dyn_size; - dchunk->map[0] = 1; - dchunk->map[1] = pcpu_reserved_chunk_limit; - dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1; - dchunk->map_used = 2; - dchunk->has_reserved = true; + if (ai->reserved_size) { + pcpu_reserved_chunk = chunk; + + tmp_addr = (unsigned long)base_addr + static_size + + ai->reserved_size; + map_size = dyn_size; + chunk = pcpu_alloc_first_chunk(tmp_addr, map_size); } /* link the first chunk in */ - pcpu_first_chunk = dchunk ?: schunk; - pcpu_nr_empty_pop_pages += - pcpu_count_occupied_pages(pcpu_first_chunk, 1); + pcpu_first_chunk = chunk; + pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages; pcpu_chunk_relocate(pcpu_first_chunk, -1); pcpu_stats_chunk_alloc(); @@ -1842,6 +2257,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( */ min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + /* determine the maximum # of units that can fit in an allocation */ alloc_size = roundup(min_unit_size, atom_size); upa = alloc_size / min_unit_size; while (alloc_size % upa || (offset_in_page(alloc_size / upa))) @@ -1868,9 +2284,9 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( } /* - * Expand unit size until address space usage goes over 75% - * and then as much as possible without using more address - * space. + * Wasted space is caused by a ratio imbalance of upa to group_cnt. + * Expand the unit_size until we use >= 75% of the units allocated. + * Related to atom_size, which could be much larger than the unit_size. */ last_allocs = INT_MAX; for (upa = max_upa; upa; upa--) { @@ -2299,36 +2715,6 @@ void __init setup_per_cpu_areas(void) #endif /* CONFIG_SMP */ /* - * First and reserved chunks are initialized with temporary allocation - * map in initdata so that they can be used before slab is online. - * This function is called after slab is brought up and replaces those - * with properly allocated maps. - */ -void __init percpu_init_late(void) -{ - struct pcpu_chunk *target_chunks[] = - { pcpu_first_chunk, pcpu_reserved_chunk, NULL }; - struct pcpu_chunk *chunk; - unsigned long flags; - int i; - - for (i = 0; (chunk = target_chunks[i]); i++) { - int *map; - const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(map[0]); - - BUILD_BUG_ON(size > PAGE_SIZE); - - map = pcpu_mem_zalloc(size); - BUG_ON(!map); - - spin_lock_irqsave(&pcpu_lock, flags); - memcpy(map, chunk->map, size); - chunk->map = map; - spin_unlock_irqrestore(&pcpu_lock, flags); - } -} - -/* * Percpu allocator is initialized early during boot when neither slab or * workqueue is available. Plug async management until everything is up * and running. diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index c99d9512a45b..1175f6a24fdb 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -124,7 +124,8 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, { pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); + VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) && + !pmd_devmap(*pmdp)) || !pmd_present(*pmdp)); pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; diff --git a/mm/rmap.c b/mm/rmap.c index c1286d47aa1f..b874c4761e84 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -63,6 +63,7 @@ #include <linux/hugetlb.h> #include <linux/backing-dev.h> #include <linux/page_idle.h> +#include <linux/memremap.h> #include <asm/tlbflush.h> @@ -390,7 +391,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) * Leave empty anon_vmas on the list - we'll need * to free them outside the lock. */ - if (RB_EMPTY_ROOT(&anon_vma->rb_root)) { + if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) { anon_vma->parent->degree--; continue; } @@ -424,7 +425,7 @@ static void anon_vma_ctor(void *data) init_rwsem(&anon_vma->rwsem); atomic_set(&anon_vma->refcount, 0); - anon_vma->rb_root = RB_ROOT; + anon_vma->rb_root = RB_ROOT_CACHED; } void __init anon_vma_init(void) @@ -887,11 +888,21 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, .address = address, .flags = PVMW_SYNC, }; + unsigned long start = address, end; int *cleaned = arg; - bool invalidation_needed = false; + + /* + * We have to assume the worse case ie pmd for invalidation. Note that + * the page can not be free from this function. + */ + end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); + mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); while (page_vma_mapped_walk(&pvmw)) { + unsigned long cstart, cend; int ret = 0; + + cstart = address = pvmw.address; if (pvmw.pte) { pte_t entry; pte_t *pte = pvmw.pte; @@ -899,11 +910,12 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, if (!pte_dirty(*pte) && !pte_write(*pte)) continue; - flush_cache_page(vma, pvmw.address, pte_pfn(*pte)); - entry = ptep_clear_flush(vma, pvmw.address, pte); + flush_cache_page(vma, address, pte_pfn(*pte)); + entry = ptep_clear_flush(vma, address, pte); entry = pte_wrprotect(entry); entry = pte_mkclean(entry); - set_pte_at(vma->vm_mm, pvmw.address, pte, entry); + set_pte_at(vma->vm_mm, address, pte, entry); + cend = cstart + PAGE_SIZE; ret = 1; } else { #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE @@ -913,11 +925,13 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, if (!pmd_dirty(*pmd) && !pmd_write(*pmd)) continue; - flush_cache_page(vma, pvmw.address, page_to_pfn(page)); - entry = pmdp_huge_clear_flush(vma, pvmw.address, pmd); + flush_cache_page(vma, address, page_to_pfn(page)); + entry = pmdp_huge_clear_flush(vma, address, pmd); entry = pmd_wrprotect(entry); entry = pmd_mkclean(entry); - set_pmd_at(vma->vm_mm, pvmw.address, pmd, entry); + set_pmd_at(vma->vm_mm, address, pmd, entry); + cstart &= PMD_MASK; + cend = cstart + PMD_SIZE; ret = 1; #else /* unexpected pmd-mapped page? */ @@ -926,15 +940,12 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, } if (ret) { + mmu_notifier_invalidate_range(vma->vm_mm, cstart, cend); (*cleaned)++; - invalidation_needed = true; } } - if (invalidation_needed) { - mmu_notifier_invalidate_range(vma->vm_mm, address, - address + (1UL << compound_order(page))); - } + mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); return true; } @@ -1328,19 +1339,45 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, }; pte_t pteval; struct page *subpage; - bool ret = true, invalidation_needed = false; + bool ret = true; + unsigned long start = address, end; enum ttu_flags flags = (enum ttu_flags)arg; /* munlock has nothing to gain from examining un-locked vmas */ if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) return true; + if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) && + is_zone_device_page(page) && !is_device_private_page(page)) + return true; + if (flags & TTU_SPLIT_HUGE_PMD) { split_huge_pmd_address(vma, address, - flags & TTU_MIGRATION, page); + flags & TTU_SPLIT_FREEZE, page); } + /* + * We have to assume the worse case ie pmd for invalidation. Note that + * the page can not be free in this function as call of try_to_unmap() + * must hold a reference on the page. + */ + end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); + mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); + while (page_vma_mapped_walk(&pvmw)) { +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + /* PMD-mapped THP migration entry */ + if (!pvmw.pte && (flags & TTU_MIGRATION)) { + VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page); + + if (!PageAnon(page)) + continue; + + set_pmd_migration_entry(&pvmw, page); + continue; + } +#endif + /* * If the page is mlock()d, we cannot swap it out. * If it's recently referenced (perhaps page_referenced @@ -1368,9 +1405,32 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, VM_BUG_ON_PAGE(!pvmw.pte, page); subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); + address = pvmw.address; + + + if (IS_ENABLED(CONFIG_MIGRATION) && + (flags & TTU_MIGRATION) && + is_zone_device_page(page)) { + swp_entry_t entry; + pte_t swp_pte; + + pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte); + + /* + * Store the pfn of the page in a special migration + * pte. do_swap_page() will wait until the migration + * pte is removed and then restart fault handling. + */ + entry = make_migration_entry(page, 0); + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); + goto discard; + } if (!(flags & TTU_IGNORE_ACCESS)) { - if (ptep_clear_flush_young_notify(vma, pvmw.address, + if (ptep_clear_flush_young_notify(vma, address, pvmw.pte)) { ret = false; page_vma_mapped_walk_done(&pvmw); @@ -1379,7 +1439,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, } /* Nuke the page table entry. */ - flush_cache_page(vma, pvmw.address, pte_pfn(*pvmw.pte)); + flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); if (should_defer_flush(mm, flags)) { /* * We clear the PTE but do not flush so potentially @@ -1389,12 +1449,11 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * transition on a cached TLB entry is written through * and traps if the PTE is unmapped. */ - pteval = ptep_get_and_clear(mm, pvmw.address, - pvmw.pte); + pteval = ptep_get_and_clear(mm, address, pvmw.pte); set_tlb_ubc_flush_pending(mm, pte_dirty(pteval)); } else { - pteval = ptep_clear_flush(vma, pvmw.address, pvmw.pte); + pteval = ptep_clear_flush(vma, address, pvmw.pte); } /* Move the dirty bit to the page. Now the pte is gone. */ @@ -1409,12 +1468,12 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (PageHuge(page)) { int nr = 1 << compound_order(page); hugetlb_count_sub(nr, mm); - set_huge_swap_pte_at(mm, pvmw.address, + set_huge_swap_pte_at(mm, address, pvmw.pte, pteval, vma_mmu_pagesize(vma)); } else { dec_mm_counter(mm, mm_counter(page)); - set_pte_at(mm, pvmw.address, pvmw.pte, pteval); + set_pte_at(mm, address, pvmw.pte, pteval); } } else if (pte_unused(pteval)) { @@ -1425,7 +1484,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ dec_mm_counter(mm, mm_counter(page)); } else if (IS_ENABLED(CONFIG_MIGRATION) && - (flags & TTU_MIGRATION)) { + (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) { swp_entry_t entry; pte_t swp_pte; /* @@ -1438,7 +1497,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); - set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); + set_pte_at(mm, address, pvmw.pte, swp_pte); } else if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(subpage) }; pte_t swp_pte; @@ -1449,6 +1508,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) { WARN_ON_ONCE(1); ret = false; + /* We have to invalidate as we cleared the pte */ page_vma_mapped_walk_done(&pvmw); break; } @@ -1464,7 +1524,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * If the page was redirtied, it cannot be * discarded. Remap the page to page table. */ - set_pte_at(mm, pvmw.address, pvmw.pte, pteval); + set_pte_at(mm, address, pvmw.pte, pteval); SetPageSwapBacked(page); ret = false; page_vma_mapped_walk_done(&pvmw); @@ -1472,7 +1532,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, } if (swap_duplicate(entry) < 0) { - set_pte_at(mm, pvmw.address, pvmw.pte, pteval); + set_pte_at(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); break; @@ -1488,18 +1548,18 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); - set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); + set_pte_at(mm, address, pvmw.pte, swp_pte); } else dec_mm_counter(mm, mm_counter_file(page)); discard: page_remove_rmap(subpage, PageHuge(page)); put_page(page); - invalidation_needed = true; + mmu_notifier_invalidate_range(mm, address, + address + PAGE_SIZE); } - if (invalidation_needed) - mmu_notifier_invalidate_range(mm, address, - address + (1UL << compound_order(page))); + mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); + return ret; } @@ -1554,7 +1614,8 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags) * locking requirements of exec(), migration skips * temporary VMAs until after exec() completes. */ - if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page)) + if ((flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE)) + && !PageKsm(page) && PageAnon(page)) rwc.invalid_vma = invalid_migration_vma; if (flags & TTU_RMAP_LOCKED) diff --git a/mm/shmem.c b/mm/shmem.c index fbcb3c96a186..07a1d22807be 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -34,6 +34,7 @@ #include <linux/swap.h> #include <linux/uio.h> #include <linux/khugepaged.h> +#include <linux/hugetlb.h> #include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */ @@ -188,6 +189,38 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages) vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); } +static inline bool shmem_inode_acct_block(struct inode *inode, long pages) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + + if (shmem_acct_block(info->flags, pages)) + return false; + + if (sbinfo->max_blocks) { + if (percpu_counter_compare(&sbinfo->used_blocks, + sbinfo->max_blocks - pages) > 0) + goto unacct; + percpu_counter_add(&sbinfo->used_blocks, pages); + } + + return true; + +unacct: + shmem_unacct_blocks(info->flags, pages); + return false; +} + +static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + + if (sbinfo->max_blocks) + percpu_counter_sub(&sbinfo->used_blocks, pages); + shmem_unacct_blocks(info->flags, pages); +} + static const struct super_operations shmem_ops; static const struct address_space_operations shmem_aops; static const struct file_operations shmem_file_operations; @@ -249,23 +282,20 @@ static void shmem_recalc_inode(struct inode *inode) freed = info->alloced - info->swapped - inode->i_mapping->nrpages; if (freed > 0) { - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - if (sbinfo->max_blocks) - percpu_counter_add(&sbinfo->used_blocks, -freed); info->alloced -= freed; inode->i_blocks -= freed * BLOCKS_PER_PAGE; - shmem_unacct_blocks(info->flags, freed); + shmem_inode_unacct_blocks(inode, freed); } } bool shmem_charge(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); unsigned long flags; - if (shmem_acct_block(info->flags, pages)) + if (!shmem_inode_acct_block(inode, pages)) return false; + spin_lock_irqsave(&info->lock, flags); info->alloced += pages; inode->i_blocks += pages * BLOCKS_PER_PAGE; @@ -273,26 +303,12 @@ bool shmem_charge(struct inode *inode, long pages) spin_unlock_irqrestore(&info->lock, flags); inode->i_mapping->nrpages += pages; - if (!sbinfo->max_blocks) - return true; - if (percpu_counter_compare(&sbinfo->used_blocks, - sbinfo->max_blocks - pages) > 0) { - inode->i_mapping->nrpages -= pages; - spin_lock_irqsave(&info->lock, flags); - info->alloced -= pages; - shmem_recalc_inode(inode); - spin_unlock_irqrestore(&info->lock, flags); - shmem_unacct_blocks(info->flags, pages); - return false; - } - percpu_counter_add(&sbinfo->used_blocks, pages); return true; } void shmem_uncharge(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); unsigned long flags; spin_lock_irqsave(&info->lock, flags); @@ -301,9 +317,7 @@ void shmem_uncharge(struct inode *inode, long pages) shmem_recalc_inode(inode); spin_unlock_irqrestore(&info->lock, flags); - if (sbinfo->max_blocks) - percpu_counter_sub(&sbinfo->used_blocks, pages); - shmem_unacct_blocks(info->flags, pages); + shmem_inode_unacct_blocks(inode, pages); } /* @@ -1452,9 +1466,10 @@ static struct page *shmem_alloc_page(gfp_t gfp, } static struct page *shmem_alloc_and_acct_page(gfp_t gfp, - struct shmem_inode_info *info, struct shmem_sb_info *sbinfo, + struct inode *inode, pgoff_t index, bool huge) { + struct shmem_inode_info *info = SHMEM_I(inode); struct page *page; int nr; int err = -ENOSPC; @@ -1463,14 +1478,8 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp, huge = false; nr = huge ? HPAGE_PMD_NR : 1; - if (shmem_acct_block(info->flags, nr)) + if (!shmem_inode_acct_block(inode, nr)) goto failed; - if (sbinfo->max_blocks) { - if (percpu_counter_compare(&sbinfo->used_blocks, - sbinfo->max_blocks - nr) > 0) - goto unacct; - percpu_counter_add(&sbinfo->used_blocks, nr); - } if (huge) page = shmem_alloc_hugepage(gfp, info, index); @@ -1483,10 +1492,7 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp, } err = -ENOMEM; - if (sbinfo->max_blocks) - percpu_counter_add(&sbinfo->used_blocks, -nr); -unacct: - shmem_unacct_blocks(info->flags, nr); + shmem_inode_unacct_blocks(inode, nr); failed: return ERR_PTR(err); } @@ -1644,7 +1650,7 @@ repeat: if (swap.val) { /* Look it up and read it in.. */ - page = lookup_swap_cache(swap); + page = lookup_swap_cache(swap, NULL, 0); if (!page) { /* Or update major stats only when swapin succeeds?? */ if (fault_type) { @@ -1751,10 +1757,9 @@ repeat: } alloc_huge: - page = shmem_alloc_and_acct_page(gfp, info, sbinfo, - index, true); + page = shmem_alloc_and_acct_page(gfp, inode, index, true); if (IS_ERR(page)) { -alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, info, sbinfo, +alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode, index, false); } if (IS_ERR(page)) { @@ -1876,10 +1881,7 @@ clear: * Error recovery. */ unacct: - if (sbinfo->max_blocks) - percpu_counter_sub(&sbinfo->used_blocks, - 1 << compound_order(page)); - shmem_unacct_blocks(info->flags, 1 << compound_order(page)); + shmem_inode_unacct_blocks(inode, 1 << compound_order(page)); if (PageTransHuge(page)) { unlock_page(page); @@ -2206,16 +2208,16 @@ bool shmem_mapping(struct address_space *mapping) return mapping->a_ops == &shmem_aops; } -int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - struct page **pagep) +static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + bool zeropage, + struct page **pagep) { struct inode *inode = file_inode(dst_vma->vm_file); struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct address_space *mapping = inode->i_mapping; gfp_t gfp = mapping_gfp_mask(mapping); pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); @@ -2227,33 +2229,30 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, int ret; ret = -ENOMEM; - if (shmem_acct_block(info->flags, 1)) + if (!shmem_inode_acct_block(inode, 1)) goto out; - if (sbinfo->max_blocks) { - if (percpu_counter_compare(&sbinfo->used_blocks, - sbinfo->max_blocks) >= 0) - goto out_unacct_blocks; - percpu_counter_inc(&sbinfo->used_blocks); - } if (!*pagep) { page = shmem_alloc_page(gfp, info, pgoff); if (!page) - goto out_dec_used_blocks; - - page_kaddr = kmap_atomic(page); - ret = copy_from_user(page_kaddr, (const void __user *)src_addr, - PAGE_SIZE); - kunmap_atomic(page_kaddr); - - /* fallback to copy_from_user outside mmap_sem */ - if (unlikely(ret)) { - *pagep = page; - if (sbinfo->max_blocks) - percpu_counter_add(&sbinfo->used_blocks, -1); - shmem_unacct_blocks(info->flags, 1); - /* don't free the page */ - return -EFAULT; + goto out_unacct_blocks; + + if (!zeropage) { /* mcopy_atomic */ + page_kaddr = kmap_atomic(page); + ret = copy_from_user(page_kaddr, + (const void __user *)src_addr, + PAGE_SIZE); + kunmap_atomic(page_kaddr); + + /* fallback to copy_from_user outside mmap_sem */ + if (unlikely(ret)) { + *pagep = page; + shmem_inode_unacct_blocks(inode, 1); + /* don't free the page */ + return -EFAULT; + } + } else { /* mfill_zeropage_atomic */ + clear_highpage(page); } } else { page = *pagep; @@ -2314,14 +2313,33 @@ out_release_uncharge: out_release: unlock_page(page); put_page(page); -out_dec_used_blocks: - if (sbinfo->max_blocks) - percpu_counter_add(&sbinfo->used_blocks, -1); out_unacct_blocks: - shmem_unacct_blocks(info->flags, 1); + shmem_inode_unacct_blocks(inode, 1); goto out; } +int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **pagep) +{ + return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, + dst_addr, src_addr, false, pagep); +} + +int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr) +{ + struct page *page = NULL; + + return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, + dst_addr, 0, true, &page); +} + #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; @@ -3635,7 +3653,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) -#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING) +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) SYSCALL_DEFINE2(memfd_create, const char __user *, uname, @@ -3647,8 +3665,18 @@ SYSCALL_DEFINE2(memfd_create, char *name; long len; - if (flags & ~(unsigned int)MFD_ALL_FLAGS) - return -EINVAL; + if (!(flags & MFD_HUGETLB)) { + if (flags & ~(unsigned int)MFD_ALL_FLAGS) + return -EINVAL; + } else { + /* Sealing not supported in hugetlbfs (MFD_HUGETLB) */ + if (flags & MFD_ALLOW_SEALING) + return -EINVAL; + /* Allow huge page size encoding in flags. */ + if (flags & ~(unsigned int)(MFD_ALL_FLAGS | + (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) + return -EINVAL; + } /* length includes terminating zero */ len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); @@ -3657,7 +3685,7 @@ SYSCALL_DEFINE2(memfd_create, if (len > MFD_NAME_MAX_LEN + 1) return -EINVAL; - name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_TEMPORARY); + name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL); if (!name) return -ENOMEM; @@ -3679,16 +3707,30 @@ SYSCALL_DEFINE2(memfd_create, goto err_name; } - file = shmem_file_setup(name, 0, VM_NORESERVE); + if (flags & MFD_HUGETLB) { + struct user_struct *user = NULL; + + file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user, + HUGETLB_ANONHUGE_INODE, + (flags >> MFD_HUGE_SHIFT) & + MFD_HUGE_MASK); + } else + file = shmem_file_setup(name, 0, VM_NORESERVE); if (IS_ERR(file)) { error = PTR_ERR(file); goto err_fd; } - info = SHMEM_I(file_inode(file)); file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; file->f_flags |= O_RDWR | O_LARGEFILE; - if (flags & MFD_ALLOW_SEALING) + + if (flags & MFD_ALLOW_SEALING) { + /* + * flags check at beginning of function ensures + * this is not a hugetlbfs (MFD_HUGETLB) file. + */ + info = SHMEM_I(file_inode(file)); info->seals &= ~F_SEAL_SEAL; + } fd_install(fd, file); kfree(name); diff --git a/mm/slab.h b/mm/slab.h index 6885e1192ec5..073362816acc 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -43,6 +43,7 @@ struct kmem_cache { #include <linux/kasan.h> #include <linux/kmemleak.h> #include <linux/random.h> +#include <linux/sched/mm.h> /* * State of the slab allocator. @@ -412,7 +413,10 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) { flags &= gfp_allowed_mask; - lockdep_trace_alloc(flags); + + fs_reclaim_acquire(flags); + fs_reclaim_release(flags); + might_sleep_if(gfpflags_allow_blocking(flags)); if (should_failslab(s, flags)) diff --git a/mm/slob.c b/mm/slob.c index 1bae78d71096..a8bd6fa11a66 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -432,7 +432,8 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) gfp &= gfp_allowed_mask; - lockdep_trace_alloc(gfp); + fs_reclaim_acquire(gfp); + fs_reclaim_release(gfp); if (size < PAGE_SIZE - align) { if (!size) @@ -538,7 +539,8 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) flags &= gfp_allowed_mask; - lockdep_trace_alloc(flags); + fs_reclaim_acquire(flags); + fs_reclaim_release(flags); if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node); diff --git a/mm/slub.c b/mm/slub.c index e8b4e31162ca..163352c537ab 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -34,6 +34,7 @@ #include <linux/stacktrace.h> #include <linux/prefetch.h> #include <linux/memcontrol.h> +#include <linux/random.h> #include <trace/events/kmem.h> @@ -238,30 +239,62 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) * Core slab cache functions *******************************************************************/ +/* + * Returns freelist pointer (ptr). With hardening, this is obfuscated + * with an XOR of the address where the pointer is held and a per-cache + * random number. + */ +static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr, + unsigned long ptr_addr) +{ +#ifdef CONFIG_SLAB_FREELIST_HARDENED + return (void *)((unsigned long)ptr ^ s->random ^ ptr_addr); +#else + return ptr; +#endif +} + +/* Returns the freelist pointer recorded at location ptr_addr. */ +static inline void *freelist_dereference(const struct kmem_cache *s, + void *ptr_addr) +{ + return freelist_ptr(s, (void *)*(unsigned long *)(ptr_addr), + (unsigned long)ptr_addr); +} + static inline void *get_freepointer(struct kmem_cache *s, void *object) { - return *(void **)(object + s->offset); + return freelist_dereference(s, object + s->offset); } static void prefetch_freepointer(const struct kmem_cache *s, void *object) { - prefetch(object + s->offset); + if (object) + prefetch(freelist_dereference(s, object + s->offset)); } static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) { + unsigned long freepointer_addr; void *p; if (!debug_pagealloc_enabled()) return get_freepointer(s, object); - probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p)); - return p; + freepointer_addr = (unsigned long)object + s->offset; + probe_kernel_read(&p, (void **)freepointer_addr, sizeof(p)); + return freelist_ptr(s, p, freepointer_addr); } static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) { - *(void **)(object + s->offset) = fp; + unsigned long freeptr_addr = (unsigned long)object + s->offset; + +#ifdef CONFIG_SLAB_FREELIST_HARDENED + BUG_ON(object == fp); /* naive detection of double free or corruption */ +#endif + + *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr); } /* Loop over all objects in a slab */ @@ -3358,8 +3391,8 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) struct kmem_cache_node *n; for_each_kmem_cache_node(s, node, n) { - kmem_cache_free(kmem_cache_node, n); s->node[node] = NULL; + kmem_cache_free(kmem_cache_node, n); } } @@ -3389,8 +3422,8 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) return 0; } - s->node[node] = n; init_kmem_cache_node(n); + s->node[node] = n; } return 1; } @@ -3563,6 +3596,9 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) { s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); s->reserved = 0; +#ifdef CONFIG_SLAB_FREELIST_HARDENED + s->random = get_random_long(); +#endif if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU)) s->reserved = sizeof(struct rcu_head); @@ -4196,7 +4232,7 @@ void __init kmem_cache_init(void) cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL, slub_cpu_dead); - pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%d, Nodes=%d\n", + pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%u, Nodes=%d\n", cache_line_size(), slub_min_order, slub_max_order, slub_min_objects, nr_cpu_ids, nr_node_ids); @@ -4561,7 +4597,7 @@ static int list_locations(struct kmem_cache *s, char *buf, struct kmem_cache_node *n; if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), - GFP_TEMPORARY)) { + GFP_KERNEL)) { kfree(map); return sprintf(buf, "Out of memory\n"); } @@ -5423,7 +5459,7 @@ static struct attribute *slab_attrs[] = { NULL }; -static struct attribute_group slab_attr_group = { +static const struct attribute_group slab_attr_group = { .attrs = slab_attrs, }; diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index c50b1a14d55e..d1a39b8051e0 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -54,14 +54,9 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) if (slab_is_available()) { struct page *page; - if (node_state(node, N_HIGH_MEMORY)) - page = alloc_pages_node( - node, GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL, - get_order(size)); - else - page = alloc_pages( - GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL, - get_order(size)); + page = alloc_pages_node(node, + GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL, + get_order(size)); if (page) return page_address(page); return NULL; diff --git a/mm/sparse.c b/mm/sparse.c index 7b4be3fd5cac..83b3bf6461af 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -65,14 +65,10 @@ static noinline struct mem_section __ref *sparse_index_alloc(int nid) unsigned long array_size = SECTIONS_PER_ROOT * sizeof(struct mem_section); - if (slab_is_available()) { - if (node_state(nid, N_HIGH_MEMORY)) - section = kzalloc_node(array_size, GFP_KERNEL, nid); - else - section = kzalloc(array_size, GFP_KERNEL); - } else { + if (slab_is_available()) + section = kzalloc_node(array_size, GFP_KERNEL, nid); + else section = memblock_virt_alloc_node(array_size, nid); - } return section; } @@ -630,7 +626,7 @@ void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn) unsigned long pfn; for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { - unsigned long section_nr = pfn_to_section_nr(start_pfn); + unsigned long section_nr = pfn_to_section_nr(pfn); struct mem_section *ms; /* onlining code should never touch invalid ranges */ diff --git a/mm/swap.c b/mm/swap.c index 60b1d2a75852..9295ae960d66 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -765,6 +765,17 @@ void release_pages(struct page **pages, int nr, bool cold) if (is_huge_zero_page(page)) continue; + /* Device public page can not be huge page */ + if (is_device_public_page(page)) { + if (locked_pgdat) { + spin_unlock_irqrestore(&locked_pgdat->lru_lock, + flags); + locked_pgdat = NULL; + } + put_zone_device_private_or_public_page(page); + continue; + } + page = compound_head(page); if (!put_page_testzero(page)) continue; @@ -946,28 +957,34 @@ void pagevec_remove_exceptionals(struct pagevec *pvec) } /** - * pagevec_lookup - gang pagecache lookup + * pagevec_lookup_range - gang pagecache lookup * @pvec: Where the resulting pages are placed * @mapping: The address_space to search * @start: The starting page index + * @end: The final page index * @nr_pages: The maximum number of pages * - * pagevec_lookup() will search for and return a group of up to @nr_pages pages - * in the mapping. The pages are placed in @pvec. pagevec_lookup() takes a + * pagevec_lookup_range() will search for and return a group of up to @nr_pages + * pages in the mapping starting from index @start and upto index @end + * (inclusive). The pages are placed in @pvec. pagevec_lookup() takes a * reference against the pages in @pvec. * * The search returns a group of mapping-contiguous pages with ascending - * indexes. There may be holes in the indices due to not-present pages. + * indexes. There may be holes in the indices due to not-present pages. We + * also update @start to index the next page for the traversal. * - * pagevec_lookup() returns the number of pages which were found. + * pagevec_lookup_range() returns the number of pages which were found. If this + * number is smaller than @nr_pages, the end of specified range has been + * reached. */ -unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, - pgoff_t start, unsigned nr_pages) +unsigned pagevec_lookup_range(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *start, pgoff_t end) { - pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages); + pvec->nr = find_get_pages_range(mapping, start, end, PAGEVEC_SIZE, + pvec->pages); return pagevec_count(pvec); } -EXPORT_SYMBOL(pagevec_lookup); +EXPORT_SYMBOL(pagevec_lookup_range); unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, int tag, unsigned nr_pages) diff --git a/mm/swap_state.c b/mm/swap_state.c index b68c93014f50..71ce2d1ccbf7 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -37,6 +37,29 @@ static const struct address_space_operations swap_aops = { struct address_space *swapper_spaces[MAX_SWAPFILES]; static unsigned int nr_swapper_spaces[MAX_SWAPFILES]; +bool swap_vma_readahead = true; + +#define SWAP_RA_MAX_ORDER_DEFAULT 3 + +static int swap_ra_max_order = SWAP_RA_MAX_ORDER_DEFAULT; + +#define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) +#define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) +#define SWAP_RA_HITS_MAX SWAP_RA_HITS_MASK +#define SWAP_RA_WIN_MASK (~PAGE_MASK & ~SWAP_RA_HITS_MASK) + +#define SWAP_RA_HITS(v) ((v) & SWAP_RA_HITS_MASK) +#define SWAP_RA_WIN(v) (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT) +#define SWAP_RA_ADDR(v) ((v) & PAGE_MASK) + +#define SWAP_RA_VAL(addr, win, hits) \ + (((addr) & PAGE_MASK) | \ + (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) | \ + ((hits) & SWAP_RA_HITS_MASK)) + +/* Initial readahead hits is 4 to start up with a small window */ +#define GET_SWAP_RA_VAL(vma) \ + (atomic_long_read(&(vma)->swap_readahead_info) ? : 4) #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) #define ADD_CACHE_INFO(x, nr) do { swap_cache_info.x += (nr); } while (0) @@ -297,19 +320,36 @@ void free_pages_and_swap_cache(struct page **pages, int nr) * lock getting page table operations atomic even if we drop the page * lock before returning. */ -struct page * lookup_swap_cache(swp_entry_t entry) +struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, + unsigned long addr) { struct page *page; + unsigned long ra_info; + int win, hits, readahead; page = find_get_page(swap_address_space(entry), swp_offset(entry)); - if (page && likely(!PageTransCompound(page))) { + INC_CACHE_INFO(find_total); + if (page) { INC_CACHE_INFO(find_success); - if (TestClearPageReadahead(page)) - atomic_inc(&swapin_readahead_hits); + if (unlikely(PageTransCompound(page))) + return page; + readahead = TestClearPageReadahead(page); + if (vma) { + ra_info = GET_SWAP_RA_VAL(vma); + win = SWAP_RA_WIN(ra_info); + hits = SWAP_RA_HITS(ra_info); + if (readahead) + hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); + atomic_long_set(&vma->swap_readahead_info, + SWAP_RA_VAL(addr, win, hits)); + } + if (readahead) { + count_vm_event(SWAP_RA_HIT); + if (!vma) + atomic_inc(&swapin_readahead_hits); + } } - - INC_CACHE_INFO(find_total); return page; } @@ -424,22 +464,20 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return retpage; } -static unsigned long swapin_nr_pages(unsigned long offset) +static unsigned int __swapin_nr_pages(unsigned long prev_offset, + unsigned long offset, + int hits, + int max_pages, + int prev_win) { - static unsigned long prev_offset; - unsigned int pages, max_pages, last_ra; - static atomic_t last_readahead_pages; - - max_pages = 1 << READ_ONCE(page_cluster); - if (max_pages <= 1) - return 1; + unsigned int pages, last_ra; /* * This heuristic has been found to work well on both sequential and * random loads, swapping to hard disk or to SSD: please don't ask * what the "+ 2" means, it just happens to work well, that's all. */ - pages = atomic_xchg(&swapin_readahead_hits, 0) + 2; + pages = hits + 2; if (pages == 2) { /* * We can have no readahead hits to judge by: but must not get @@ -448,7 +486,6 @@ static unsigned long swapin_nr_pages(unsigned long offset) */ if (offset != prev_offset + 1 && offset != prev_offset - 1) pages = 1; - prev_offset = offset; } else { unsigned int roundup = 4; while (roundup < pages) @@ -460,9 +497,28 @@ static unsigned long swapin_nr_pages(unsigned long offset) pages = max_pages; /* Don't shrink readahead too fast */ - last_ra = atomic_read(&last_readahead_pages) / 2; + last_ra = prev_win / 2; if (pages < last_ra) pages = last_ra; + + return pages; +} + +static unsigned long swapin_nr_pages(unsigned long offset) +{ + static unsigned long prev_offset; + unsigned int hits, pages, max_pages; + static atomic_t last_readahead_pages; + + max_pages = 1 << READ_ONCE(page_cluster); + if (max_pages <= 1) + return 1; + + hits = atomic_xchg(&swapin_readahead_hits, 0); + pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages, + atomic_read(&last_readahead_pages)); + if (!hits) + prev_offset = offset; atomic_set(&last_readahead_pages, pages); return pages; @@ -496,7 +552,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, unsigned long start_offset, end_offset; unsigned long mask; struct blk_plug plug; - bool do_poll = true; + bool do_poll = true, page_allocated; mask = swapin_nr_pages(offset) - 1; if (!mask) @@ -512,12 +568,19 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, blk_start_plug(&plug); for (offset = start_offset; offset <= end_offset ; offset++) { /* Ok, do the async read-ahead now */ - page = read_swap_cache_async(swp_entry(swp_type(entry), offset), - gfp_mask, vma, addr, false); + page = __read_swap_cache_async( + swp_entry(swp_type(entry), offset), + gfp_mask, vma, addr, &page_allocated); if (!page) continue; - if (offset != entry_offset && likely(!PageTransCompound(page))) - SetPageReadahead(page); + if (page_allocated) { + swap_readpage(page, false); + if (offset != entry_offset && + likely(!PageTransCompound(page))) { + SetPageReadahead(page); + count_vm_event(SWAP_RA); + } + } put_page(page); } blk_finish_plug(&plug); @@ -561,3 +624,210 @@ void exit_swap_address_space(unsigned int type) synchronize_rcu(); kvfree(spaces); } + +static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma, + unsigned long faddr, + unsigned long lpfn, + unsigned long rpfn, + unsigned long *start, + unsigned long *end) +{ + *start = max3(lpfn, PFN_DOWN(vma->vm_start), + PFN_DOWN(faddr & PMD_MASK)); + *end = min3(rpfn, PFN_DOWN(vma->vm_end), + PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE)); +} + +struct page *swap_readahead_detect(struct vm_fault *vmf, + struct vma_swap_readahead *swap_ra) +{ + struct vm_area_struct *vma = vmf->vma; + unsigned long swap_ra_info; + struct page *page; + swp_entry_t entry; + unsigned long faddr, pfn, fpfn; + unsigned long start, end; + pte_t *pte; + unsigned int max_win, hits, prev_win, win, left; +#ifndef CONFIG_64BIT + pte_t *tpte; +#endif + + faddr = vmf->address; + entry = pte_to_swp_entry(vmf->orig_pte); + if ((unlikely(non_swap_entry(entry)))) + return NULL; + page = lookup_swap_cache(entry, vma, faddr); + if (page) + return page; + + max_win = 1 << READ_ONCE(swap_ra_max_order); + if (max_win == 1) { + swap_ra->win = 1; + return NULL; + } + + fpfn = PFN_DOWN(faddr); + swap_ra_info = GET_SWAP_RA_VAL(vma); + pfn = PFN_DOWN(SWAP_RA_ADDR(swap_ra_info)); + prev_win = SWAP_RA_WIN(swap_ra_info); + hits = SWAP_RA_HITS(swap_ra_info); + swap_ra->win = win = __swapin_nr_pages(pfn, fpfn, hits, + max_win, prev_win); + atomic_long_set(&vma->swap_readahead_info, + SWAP_RA_VAL(faddr, win, 0)); + + if (win == 1) + return NULL; + + /* Copy the PTEs because the page table may be unmapped */ + if (fpfn == pfn + 1) + swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end); + else if (pfn == fpfn + 1) + swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1, + &start, &end); + else { + left = (win - 1) / 2; + swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left, + &start, &end); + } + swap_ra->nr_pte = end - start; + swap_ra->offset = fpfn - start; + pte = vmf->pte - swap_ra->offset; +#ifdef CONFIG_64BIT + swap_ra->ptes = pte; +#else + tpte = swap_ra->ptes; + for (pfn = start; pfn != end; pfn++) + *tpte++ = *pte++; +#endif + + return NULL; +} + +struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, + struct vm_fault *vmf, + struct vma_swap_readahead *swap_ra) +{ + struct blk_plug plug; + struct vm_area_struct *vma = vmf->vma; + struct page *page; + pte_t *pte, pentry; + swp_entry_t entry; + unsigned int i; + bool page_allocated; + + if (swap_ra->win == 1) + goto skip; + + blk_start_plug(&plug); + for (i = 0, pte = swap_ra->ptes; i < swap_ra->nr_pte; + i++, pte++) { + pentry = *pte; + if (pte_none(pentry)) + continue; + if (pte_present(pentry)) + continue; + entry = pte_to_swp_entry(pentry); + if (unlikely(non_swap_entry(entry))) + continue; + page = __read_swap_cache_async(entry, gfp_mask, vma, + vmf->address, &page_allocated); + if (!page) + continue; + if (page_allocated) { + swap_readpage(page, false); + if (i != swap_ra->offset && + likely(!PageTransCompound(page))) { + SetPageReadahead(page); + count_vm_event(SWAP_RA); + } + } + put_page(page); + } + blk_finish_plug(&plug); + lru_add_drain(); +skip: + return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address, + swap_ra->win == 1); +} + +#ifdef CONFIG_SYSFS +static ssize_t vma_ra_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%s\n", swap_vma_readahead ? "true" : "false"); +} +static ssize_t vma_ra_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) + swap_vma_readahead = true; + else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) + swap_vma_readahead = false; + else + return -EINVAL; + + return count; +} +static struct kobj_attribute vma_ra_enabled_attr = + __ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show, + vma_ra_enabled_store); + +static ssize_t vma_ra_max_order_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", swap_ra_max_order); +} +static ssize_t vma_ra_max_order_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err, v; + + err = kstrtoint(buf, 10, &v); + if (err || v > SWAP_RA_ORDER_CEILING || v <= 0) + return -EINVAL; + + swap_ra_max_order = v; + + return count; +} +static struct kobj_attribute vma_ra_max_order_attr = + __ATTR(vma_ra_max_order, 0644, vma_ra_max_order_show, + vma_ra_max_order_store); + +static struct attribute *swap_attrs[] = { + &vma_ra_enabled_attr.attr, + &vma_ra_max_order_attr.attr, + NULL, +}; + +static struct attribute_group swap_attr_group = { + .attrs = swap_attrs, +}; + +static int __init swap_init_sysfs(void) +{ + int err; + struct kobject *swap_kobj; + + swap_kobj = kobject_create_and_add("swap", mm_kobj); + if (!swap_kobj) { + pr_err("failed to create swap kobject\n"); + return -ENOMEM; + } + err = sysfs_create_group(swap_kobj, &swap_attr_group); + if (err) { + pr_err("failed to register swap group\n"); + goto delete_obj; + } + return 0; + +delete_obj: + kobject_put(swap_kobj); + return err; +} +subsys_initcall(swap_init_sysfs); +#endif diff --git a/mm/swapfile.c b/mm/swapfile.c index 6ba4aab2db0b..bf91dc9e7a79 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -60,7 +60,7 @@ atomic_long_t nr_swap_pages; EXPORT_SYMBOL_GPL(nr_swap_pages); /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; -static int least_priority; +static int least_priority = -1; static const char Bad_file[] = "Bad swap file entry "; static const char Unused_file[] = "Unused swap file entry "; @@ -85,7 +85,7 @@ PLIST_HEAD(swap_active_head); * is held and the locking order requires swap_lock to be taken * before any swap_info_struct->lock. */ -static PLIST_HEAD(swap_avail_head); +struct plist_head *swap_avail_heads; static DEFINE_SPINLOCK(swap_avail_lock); struct swap_info_struct *swap_info[MAX_SWAPFILES]; @@ -96,6 +96,8 @@ static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); /* Activity counter to indicate that a swapon or swapoff has occurred */ static atomic_t proc_poll_event = ATOMIC_INIT(0); +atomic_t nr_rotate_swap = ATOMIC_INIT(0); + static inline unsigned char swap_count(unsigned char ent) { return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ @@ -265,6 +267,16 @@ static inline void cluster_set_null(struct swap_cluster_info *info) info->data = 0; } +static inline bool cluster_is_huge(struct swap_cluster_info *info) +{ + return info->flags & CLUSTER_FLAG_HUGE; +} + +static inline void cluster_clear_huge(struct swap_cluster_info *info) +{ + info->flags &= ~CLUSTER_FLAG_HUGE; +} + static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, unsigned long offset) { @@ -580,6 +592,21 @@ new_cluster: return found_free; } +static void __del_from_avail_list(struct swap_info_struct *p) +{ + int nid; + + for_each_node(nid) + plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]); +} + +static void del_from_avail_list(struct swap_info_struct *p) +{ + spin_lock(&swap_avail_lock); + __del_from_avail_list(p); + spin_unlock(&swap_avail_lock); +} + static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, unsigned int nr_entries) { @@ -593,10 +620,20 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, if (si->inuse_pages == si->pages) { si->lowest_bit = si->max; si->highest_bit = 0; - spin_lock(&swap_avail_lock); - plist_del(&si->avail_list, &swap_avail_head); - spin_unlock(&swap_avail_lock); + del_from_avail_list(si); + } +} + +static void add_to_avail_list(struct swap_info_struct *p) +{ + int nid; + + spin_lock(&swap_avail_lock); + for_each_node(nid) { + WARN_ON(!plist_node_empty(&p->avail_lists[nid])); + plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]); } + spin_unlock(&swap_avail_lock); } static void swap_range_free(struct swap_info_struct *si, unsigned long offset, @@ -611,13 +648,8 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, bool was_full = !si->highest_bit; si->highest_bit = end; - if (was_full && (si->flags & SWP_WRITEOK)) { - spin_lock(&swap_avail_lock); - WARN_ON(!plist_node_empty(&si->avail_list)); - if (plist_node_empty(&si->avail_list)) - plist_add(&si->avail_list, &swap_avail_head); - spin_unlock(&swap_avail_lock); - } + if (was_full && (si->flags & SWP_WRITEOK)) + add_to_avail_list(si); } atomic_long_add(nr_entries, &nr_swap_pages); si->inuse_pages -= nr_entries; @@ -846,7 +878,7 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) offset = idx * SWAPFILE_CLUSTER; ci = lock_cluster(si, offset); alloc_cluster(si, idx); - cluster_set_count_flag(ci, SWAPFILE_CLUSTER, 0); + cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE); map = si->swap_map + offset; for (i = 0; i < SWAPFILE_CLUSTER; i++) @@ -898,6 +930,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) struct swap_info_struct *si, *next; long avail_pgs; int n_ret = 0; + int node; /* Only single cluster request supported */ WARN_ON_ONCE(n_goal > 1 && cluster); @@ -917,14 +950,15 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) spin_lock(&swap_avail_lock); start_over: - plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { + node = numa_node_id(); + plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { /* requeue si to after same-priority siblings */ - plist_requeue(&si->avail_list, &swap_avail_head); + plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); spin_unlock(&swap_avail_lock); spin_lock(&si->lock); if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { spin_lock(&swap_avail_lock); - if (plist_node_empty(&si->avail_list)) { + if (plist_node_empty(&si->avail_lists[node])) { spin_unlock(&si->lock); goto nextsi; } @@ -934,13 +968,14 @@ start_over: WARN(!(si->flags & SWP_WRITEOK), "swap_info %d in list but !SWP_WRITEOK\n", si->type); - plist_del(&si->avail_list, &swap_avail_head); + __del_from_avail_list(si); spin_unlock(&si->lock); goto nextsi; } - if (cluster) - n_ret = swap_alloc_cluster(si, swp_entries); - else + if (cluster) { + if (!(si->flags & SWP_FILE)) + n_ret = swap_alloc_cluster(si, swp_entries); + } else n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, n_goal, swp_entries); spin_unlock(&si->lock); @@ -962,7 +997,7 @@ nextsi: * swap_avail_head list then try it, otherwise start over * if we have not gotten any slots. */ - if (plist_node_empty(&next->avail_list)) + if (plist_node_empty(&next->avail_lists[node])) goto start_over; } @@ -1168,22 +1203,57 @@ static void swapcache_free_cluster(swp_entry_t entry) struct swap_cluster_info *ci; struct swap_info_struct *si; unsigned char *map; - unsigned int i; + unsigned int i, free_entries = 0; + unsigned char val; - si = swap_info_get(entry); + si = _swap_info_get(entry); if (!si) return; ci = lock_cluster(si, offset); + VM_BUG_ON(!cluster_is_huge(ci)); map = si->swap_map + offset; for (i = 0; i < SWAPFILE_CLUSTER; i++) { - VM_BUG_ON(map[i] != SWAP_HAS_CACHE); - map[i] = 0; + val = map[i]; + VM_BUG_ON(!(val & SWAP_HAS_CACHE)); + if (val == SWAP_HAS_CACHE) + free_entries++; + } + if (!free_entries) { + for (i = 0; i < SWAPFILE_CLUSTER; i++) + map[i] &= ~SWAP_HAS_CACHE; } + cluster_clear_huge(ci); unlock_cluster(ci); - mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); - swap_free_cluster(si, idx); - spin_unlock(&si->lock); + if (free_entries == SWAPFILE_CLUSTER) { + spin_lock(&si->lock); + ci = lock_cluster(si, offset); + memset(map, 0, SWAPFILE_CLUSTER); + unlock_cluster(ci); + mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); + swap_free_cluster(si, idx); + spin_unlock(&si->lock); + } else if (free_entries) { + for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) { + if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE)) + free_swap_slot(entry); + } + } +} + +int split_swap_cluster(swp_entry_t entry) +{ + struct swap_info_struct *si; + struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); + + si = _swap_info_get(entry); + if (!si) + return -EBUSY; + ci = lock_cluster(si, offset); + cluster_clear_huge(ci); + unlock_cluster(ci); + return 0; } #else static inline void swapcache_free_cluster(swp_entry_t entry) @@ -1332,29 +1402,161 @@ out: return count; } +#ifdef CONFIG_THP_SWAP +static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, + swp_entry_t entry) +{ + struct swap_cluster_info *ci; + unsigned char *map = si->swap_map; + unsigned long roffset = swp_offset(entry); + unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER); + int i; + bool ret = false; + + ci = lock_cluster_or_swap_info(si, offset); + if (!ci || !cluster_is_huge(ci)) { + if (map[roffset] != SWAP_HAS_CACHE) + ret = true; + goto unlock_out; + } + for (i = 0; i < SWAPFILE_CLUSTER; i++) { + if (map[offset + i] != SWAP_HAS_CACHE) { + ret = true; + break; + } + } +unlock_out: + unlock_cluster_or_swap_info(si, ci); + return ret; +} + +static bool page_swapped(struct page *page) +{ + swp_entry_t entry; + struct swap_info_struct *si; + + if (likely(!PageTransCompound(page))) + return page_swapcount(page) != 0; + + page = compound_head(page); + entry.val = page_private(page); + si = _swap_info_get(entry); + if (si) + return swap_page_trans_huge_swapped(si, entry); + return false; +} + +static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, + int *total_swapcount) +{ + int i, map_swapcount, _total_mapcount, _total_swapcount; + unsigned long offset = 0; + struct swap_info_struct *si; + struct swap_cluster_info *ci = NULL; + unsigned char *map = NULL; + int mapcount, swapcount = 0; + + /* hugetlbfs shouldn't call it */ + VM_BUG_ON_PAGE(PageHuge(page), page); + + if (likely(!PageTransCompound(page))) { + mapcount = atomic_read(&page->_mapcount) + 1; + if (total_mapcount) + *total_mapcount = mapcount; + if (PageSwapCache(page)) + swapcount = page_swapcount(page); + if (total_swapcount) + *total_swapcount = swapcount; + return mapcount + swapcount; + } + + page = compound_head(page); + + _total_mapcount = _total_swapcount = map_swapcount = 0; + if (PageSwapCache(page)) { + swp_entry_t entry; + + entry.val = page_private(page); + si = _swap_info_get(entry); + if (si) { + map = si->swap_map; + offset = swp_offset(entry); + } + } + if (map) + ci = lock_cluster(si, offset); + for (i = 0; i < HPAGE_PMD_NR; i++) { + mapcount = atomic_read(&page[i]._mapcount) + 1; + _total_mapcount += mapcount; + if (map) { + swapcount = swap_count(map[offset + i]); + _total_swapcount += swapcount; + } + map_swapcount = max(map_swapcount, mapcount + swapcount); + } + unlock_cluster(ci); + if (PageDoubleMap(page)) { + map_swapcount -= 1; + _total_mapcount -= HPAGE_PMD_NR; + } + mapcount = compound_mapcount(page); + map_swapcount += mapcount; + _total_mapcount += mapcount; + if (total_mapcount) + *total_mapcount = _total_mapcount; + if (total_swapcount) + *total_swapcount = _total_swapcount; + + return map_swapcount; +} +#else +#define swap_page_trans_huge_swapped(si, entry) swap_swapcount(si, entry) +#define page_swapped(page) (page_swapcount(page) != 0) + +static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, + int *total_swapcount) +{ + int mapcount, swapcount = 0; + + /* hugetlbfs shouldn't call it */ + VM_BUG_ON_PAGE(PageHuge(page), page); + + mapcount = page_trans_huge_mapcount(page, total_mapcount); + if (PageSwapCache(page)) + swapcount = page_swapcount(page); + if (total_swapcount) + *total_swapcount = swapcount; + return mapcount + swapcount; +} +#endif + /* * We can write to an anon page without COW if there are no other references * to it. And as a side-effect, free up its swap: because the old content * on disk will never be read, and seeking back there to write new content * later would only waste time away from clustering. * - * NOTE: total_mapcount should not be relied upon by the caller if + * NOTE: total_map_swapcount should not be relied upon by the caller if * reuse_swap_page() returns false, but it may be always overwritten * (see the other implementation for CONFIG_SWAP=n). */ -bool reuse_swap_page(struct page *page, int *total_mapcount) +bool reuse_swap_page(struct page *page, int *total_map_swapcount) { - int count; + int count, total_mapcount, total_swapcount; VM_BUG_ON_PAGE(!PageLocked(page), page); if (unlikely(PageKsm(page))) return false; - count = page_trans_huge_mapcount(page, total_mapcount); - if (count <= 1 && PageSwapCache(page)) { - count += page_swapcount(page); - if (count != 1) - goto out; + count = page_trans_huge_map_swapcount(page, &total_mapcount, + &total_swapcount); + if (total_map_swapcount) + *total_map_swapcount = total_mapcount + total_swapcount; + if (count == 1 && PageSwapCache(page) && + (likely(!PageTransCompound(page)) || + /* The remaining swap count will be freed soon */ + total_swapcount == page_swapcount(page))) { if (!PageWriteback(page)) { + page = compound_head(page); delete_from_swap_cache(page); SetPageDirty(page); } else { @@ -1370,7 +1572,7 @@ bool reuse_swap_page(struct page *page, int *total_mapcount) spin_unlock(&p->lock); } } -out: + return count <= 1; } @@ -1386,7 +1588,7 @@ int try_to_free_swap(struct page *page) return 0; if (PageWriteback(page)) return 0; - if (page_swapcount(page)) + if (page_swapped(page)) return 0; /* @@ -1407,6 +1609,7 @@ int try_to_free_swap(struct page *page) if (pm_suspended_storage()) return 0; + page = compound_head(page); delete_from_swap_cache(page); SetPageDirty(page); return 1; @@ -1428,7 +1631,8 @@ int free_swap_and_cache(swp_entry_t entry) p = _swap_info_get(entry); if (p) { count = __swap_entry_free(p, entry, 1); - if (count == SWAP_HAS_CACHE) { + if (count == SWAP_HAS_CACHE && + !swap_page_trans_huge_swapped(p, entry)) { page = find_get_page(swap_address_space(entry), swp_offset(entry)); if (page && !trylock_page(page)) { @@ -1445,7 +1649,8 @@ int free_swap_and_cache(swp_entry_t entry) */ if (PageSwapCache(page) && !PageWriteback(page) && (!page_mapped(page) || mem_cgroup_swap_full(page)) && - !swap_swapcount(p, entry)) { + !swap_page_trans_huge_swapped(p, entry)) { + page = compound_head(page); delete_from_swap_cache(page); SetPageDirty(page); } @@ -1999,7 +2204,7 @@ int try_to_unuse(unsigned int type, bool frontswap, .sync_mode = WB_SYNC_NONE, }; - swap_writepage(page, &wbc); + swap_writepage(compound_head(page), &wbc); lock_page(page); wait_on_page_writeback(page); } @@ -2012,8 +2217,9 @@ int try_to_unuse(unsigned int type, bool frontswap, * delete, since it may not have been written out to swap yet. */ if (PageSwapCache(page) && - likely(page_private(page) == entry.val)) - delete_from_swap_cache(page); + likely(page_private(page) == entry.val) && + !page_swapped(page)) + delete_from_swap_cache(compound_head(page)); /* * So we could skip searching mms once swap count went @@ -2226,10 +2432,24 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) return generic_swapfile_activate(sis, swap_file, span); } +static int swap_node(struct swap_info_struct *p) +{ + struct block_device *bdev; + + if (p->bdev) + bdev = p->bdev; + else + bdev = p->swap_file->f_inode->i_sb->s_bdev; + + return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; +} + static void _enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info) { + int i; + if (prio >= 0) p->prio = prio; else @@ -2239,7 +2459,16 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, * low-to-high, while swap ordering is high-to-low */ p->list.prio = -p->prio; - p->avail_list.prio = -p->prio; + for_each_node(i) { + if (p->prio >= 0) + p->avail_lists[i].prio = -p->prio; + else { + if (swap_node(p) == i) + p->avail_lists[i].prio = 1; + else + p->avail_lists[i].prio = -p->prio; + } + } p->swap_map = swap_map; p->cluster_info = cluster_info; p->flags |= SWP_WRITEOK; @@ -2258,9 +2487,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, * swap_info_struct. */ plist_add(&p->list, &swap_active_head); - spin_lock(&swap_avail_lock); - plist_add(&p->avail_list, &swap_avail_head); - spin_unlock(&swap_avail_lock); + add_to_avail_list(p); } static void enable_swap_info(struct swap_info_struct *p, int prio, @@ -2345,17 +2572,19 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); goto out_dput; } - spin_lock(&swap_avail_lock); - plist_del(&p->avail_list, &swap_avail_head); - spin_unlock(&swap_avail_lock); + del_from_avail_list(p); spin_lock(&p->lock); if (p->prio < 0) { struct swap_info_struct *si = p; + int nid; plist_for_each_entry_continue(si, &swap_active_head, list) { si->prio++; si->list.prio--; - si->avail_list.prio--; + for_each_node(nid) { + if (si->avail_lists[nid].prio != 1) + si->avail_lists[nid].prio--; + } } least_priority++; } @@ -2387,6 +2616,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) if (p->flags & SWP_CONTINUED) free_swap_count_continuations(p); + if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev))) + atomic_dec(&nr_rotate_swap); + mutex_lock(&swapon_mutex); spin_lock(&swap_lock); spin_lock(&p->lock); @@ -2596,6 +2828,7 @@ static struct swap_info_struct *alloc_swap_info(void) { struct swap_info_struct *p; unsigned int type; + int i; p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) @@ -2631,7 +2864,8 @@ static struct swap_info_struct *alloc_swap_info(void) } INIT_LIST_HEAD(&p->first_swap_extent.list); plist_node_init(&p->list, 0); - plist_node_init(&p->avail_list, 0); + for_each_node(i) + plist_node_init(&p->avail_lists[i], 0); p->flags = SWP_USED; spin_unlock(&swap_lock); spin_lock_init(&p->lock); @@ -2873,6 +3107,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!swap_avail_heads) + return -ENOMEM; + p = alloc_swap_info(); if (IS_ERR(p)) return PTR_ERR(p); @@ -2963,7 +3200,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) cluster = per_cpu_ptr(p->percpu_cluster, cpu); cluster_set_null(&cluster->index); } - } + } else + atomic_inc(&nr_rotate_swap); error = swap_cgroup_swapon(p->type, maxpages); if (error) @@ -3052,7 +3290,8 @@ bad_swap: p->flags = 0; spin_unlock(&swap_lock); vfree(swap_map); - vfree(cluster_info); + kvfree(cluster_info); + kvfree(frontswap_map); if (swap_file) { if (inode && S_ISREG(inode->i_mode)) { inode_unlock(inode); @@ -3457,3 +3696,21 @@ static void free_swap_count_continuations(struct swap_info_struct *si) } } } + +static int __init swapfile_init(void) +{ + int nid; + + swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head), + GFP_KERNEL); + if (!swap_avail_heads) { + pr_emerg("Not enough memory for swap heads, swap is disabled\n"); + return -ENOMEM; + } + + for_each_node(nid) + plist_head_init(&swap_avail_heads[nid]); + + return 0; +} +subsys_initcall(swapfile_init); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 8bcb501bce60..81192701964d 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -371,6 +371,36 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, bool zeropage); #endif /* CONFIG_HUGETLB_PAGE */ +static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **page, + bool zeropage) +{ + ssize_t err; + + if (vma_is_anonymous(dst_vma)) { + if (!zeropage) + err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, + dst_addr, src_addr, page); + else + err = mfill_zeropage_pte(dst_mm, dst_pmd, + dst_vma, dst_addr); + } else { + if (!zeropage) + err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, + dst_vma, dst_addr, + src_addr, page); + else + err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd, + dst_vma, dst_addr); + } + + return err; +} + static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, @@ -487,22 +517,8 @@ retry: BUG_ON(pmd_none(*dst_pmd)); BUG_ON(pmd_trans_huge(*dst_pmd)); - if (vma_is_anonymous(dst_vma)) { - if (!zeropage) - err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, - dst_addr, src_addr, - &page); - else - err = mfill_zeropage_pte(dst_mm, dst_pmd, - dst_vma, dst_addr); - } else { - err = -EINVAL; /* if zeropage is true return -EINVAL */ - if (likely(!zeropage)) - err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, - dst_vma, dst_addr, - src_addr, &page); - } - + err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, + src_addr, &page, zeropage); cond_resched(); if (unlikely(err == -EFAULT)) { diff --git a/mm/util.c b/mm/util.c index 9ecddf568fe3..34e57fae959d 100644 --- a/mm/util.c +++ b/mm/util.c @@ -614,7 +614,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) return 0; if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { - free = global_page_state(NR_FREE_PAGES); + free = global_zone_page_state(NR_FREE_PAGES); free += global_node_page_state(NR_FILE_PAGES); /* diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a47e3894c775..8a43db6284eb 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -49,12 +49,10 @@ static void __vunmap(const void *, int); static void free_work(struct work_struct *w) { struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); - struct llist_node *llnode = llist_del_all(&p->list); - while (llnode) { - void *p = llnode; - llnode = llist_next(llnode); - __vunmap(p, 1); - } + struct llist_node *t, *llnode; + + llist_for_each_safe(llnode, t, llist_del_all(&p->list)) + __vunmap((void *)llnode, 1); } /*** Page table manipulation functions ***/ @@ -2482,7 +2480,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, * matching slot. While scanning, if any of the areas overlaps with * existing vmap_area, the base address is pulled down to fit the * area. Scanning is repeated till all the areas fit and then all - * necessary data structres are inserted and the result is returned. + * necessary data structures are inserted and the result is returned. */ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, @@ -2510,15 +2508,11 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, if (start > offsets[last_area]) last_area = area; - for (area2 = 0; area2 < nr_vms; area2++) { + for (area2 = area + 1; area2 < nr_vms; area2++) { unsigned long start2 = offsets[area2]; unsigned long end2 = start2 + sizes[area2]; - if (area2 == area) - continue; - - BUG_ON(start2 >= start && start2 < end); - BUG_ON(end2 <= end && end2 > start); + BUG_ON(start2 < end && start < end2); } } last_end = offsets[last_area] + sizes[last_area]; diff --git a/mm/vmscan.c b/mm/vmscan.c index a1af041930a6..13d711dd8776 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -393,14 +393,15 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, unsigned long nr_to_scan = min(batch_size, total_scan); shrinkctl->nr_to_scan = nr_to_scan; + shrinkctl->nr_scanned = nr_to_scan; ret = shrinker->scan_objects(shrinker, shrinkctl); if (ret == SHRINK_STOP) break; freed += ret; - count_vm_events(SLABS_SCANNED, nr_to_scan); - total_scan -= nr_to_scan; - scanned += nr_to_scan; + count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned); + total_scan -= shrinkctl->nr_scanned; + scanned += shrinkctl->nr_scanned; cond_resched(); } @@ -535,7 +536,9 @@ static inline int is_page_cache_freeable(struct page *page) * that isolated the page, the page cache radix tree and * optional buffer heads at page->private. */ - return page_count(page) - page_has_private(page) == 2; + int radix_pins = PageTransHuge(page) && PageSwapCache(page) ? + HPAGE_PMD_NR : 1; + return page_count(page) - page_has_private(page) == 1 + radix_pins; } static int may_write_to_inode(struct inode *inode, struct scan_control *sc) @@ -665,6 +668,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, bool reclaimed) { unsigned long flags; + int refcount; BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); @@ -695,11 +699,15 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * Note that if SetPageDirty is always performed via set_page_dirty, * and thus under tree_lock, then this ordering is not required. */ - if (!page_ref_freeze(page, 2)) + if (unlikely(PageTransHuge(page)) && PageSwapCache(page)) + refcount = 1 + HPAGE_PMD_NR; + else + refcount = 2; + if (!page_ref_freeze(page, refcount)) goto cannot_free; /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ if (unlikely(PageDirty(page))) { - page_ref_unfreeze(page, 2); + page_ref_unfreeze(page, refcount); goto cannot_free; } @@ -1121,58 +1129,59 @@ static unsigned long shrink_page_list(struct list_head *page_list, * Try to allocate it some swap space here. * Lazyfree page could be freed directly */ - if (PageAnon(page) && PageSwapBacked(page) && - !PageSwapCache(page)) { - if (!(sc->gfp_mask & __GFP_IO)) - goto keep_locked; - if (PageTransHuge(page)) { - /* cannot split THP, skip it */ - if (!can_split_huge_page(page, NULL)) - goto activate_locked; - /* - * Split pages without a PMD map right - * away. Chances are some or all of the - * tail pages can be freed without IO. - */ - if (!compound_mapcount(page) && - split_huge_page_to_list(page, page_list)) - goto activate_locked; - } - if (!add_to_swap(page)) { - if (!PageTransHuge(page)) - goto activate_locked; - /* Split THP and swap individual base pages */ - if (split_huge_page_to_list(page, page_list)) - goto activate_locked; - if (!add_to_swap(page)) - goto activate_locked; - } - - /* XXX: We don't support THP writes */ - if (PageTransHuge(page) && - split_huge_page_to_list(page, page_list)) { - delete_from_swap_cache(page); - goto activate_locked; - } + if (PageAnon(page) && PageSwapBacked(page)) { + if (!PageSwapCache(page)) { + if (!(sc->gfp_mask & __GFP_IO)) + goto keep_locked; + if (PageTransHuge(page)) { + /* cannot split THP, skip it */ + if (!can_split_huge_page(page, NULL)) + goto activate_locked; + /* + * Split pages without a PMD map right + * away. Chances are some or all of the + * tail pages can be freed without IO. + */ + if (!compound_mapcount(page) && + split_huge_page_to_list(page, + page_list)) + goto activate_locked; + } + if (!add_to_swap(page)) { + if (!PageTransHuge(page)) + goto activate_locked; + /* Fallback to swap normal pages */ + if (split_huge_page_to_list(page, + page_list)) + goto activate_locked; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + count_vm_event(THP_SWPOUT_FALLBACK); +#endif + if (!add_to_swap(page)) + goto activate_locked; + } - may_enter_fs = 1; + may_enter_fs = 1; - /* Adding to swap updated mapping */ - mapping = page_mapping(page); + /* Adding to swap updated mapping */ + mapping = page_mapping(page); + } } else if (unlikely(PageTransHuge(page))) { /* Split file THP */ if (split_huge_page_to_list(page, page_list)) goto keep_locked; } - VM_BUG_ON_PAGE(PageTransHuge(page), page); - /* * The page is mapped into the page tables of one or more * processes. Try to unmap it here. */ if (page_mapped(page)) { - if (!try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) { + enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH; + + if (unlikely(PageTransHuge(page))) + flags |= TTU_SPLIT_HUGE_PMD; + if (!try_to_unmap(page, flags)) { nr_unmap_fail++; goto activate_locked; } @@ -1312,7 +1321,11 @@ free_it: * Is there need to periodically free_page_list? It would * appear not as the counts should be low */ - list_add(&page->lru, &free_pages); + if (unlikely(PageTransHuge(page))) { + mem_cgroup_uncharge(page); + (*get_compound_page_dtor(page))(page); + } else + list_add(&page->lru, &free_pages); continue; activate_locked: @@ -1742,9 +1755,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, int file = is_file_lru(lru); struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; + bool stalled = false; while (unlikely(too_many_isolated(pgdat, file, sc))) { - congestion_wait(BLK_RW_ASYNC, HZ/10); + if (stalled) + return 0; + + /* wait a bit for the reclaimer. */ + msleep(100); + stalled = true; /* We are about to die and free our memory. Return now. */ if (fatal_signal_pending(current)) @@ -3525,8 +3544,6 @@ static int kswapd(void *p) }; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); - lockdep_set_current_reclaim_state(GFP_KERNEL); - if (!cpumask_empty(cpumask)) set_cpus_allowed_ptr(tsk, cpumask); current->reclaim_state = &reclaim_state; @@ -3585,14 +3602,15 @@ kswapd_try_sleep: */ trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx, alloc_order); + fs_reclaim_acquire(GFP_KERNEL); reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); + fs_reclaim_release(GFP_KERNEL); if (reclaim_order < alloc_order) goto kswapd_try_sleep; } tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); current->reclaim_state = NULL; - lockdep_clear_current_reclaim_state(); return 0; } @@ -3655,14 +3673,14 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) unsigned int noreclaim_flag; noreclaim_flag = memalloc_noreclaim_save(); - lockdep_set_current_reclaim_state(sc.gfp_mask); + fs_reclaim_acquire(sc.gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; nr_reclaimed = do_try_to_free_pages(zonelist, &sc); p->reclaim_state = NULL; - lockdep_clear_current_reclaim_state(); + fs_reclaim_release(sc.gfp_mask); memalloc_noreclaim_restore(noreclaim_flag); return nr_reclaimed; @@ -3847,7 +3865,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in */ noreclaim_flag = memalloc_noreclaim_save(); p->flags |= PF_SWAPWRITE; - lockdep_set_current_reclaim_state(sc.gfp_mask); + fs_reclaim_acquire(sc.gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; @@ -3862,9 +3880,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in } p->reclaim_state = NULL; + fs_reclaim_release(gfp_mask); current->flags &= ~PF_SWAPWRITE; memalloc_noreclaim_restore(noreclaim_flag); - lockdep_clear_current_reclaim_state(); return sc.nr_reclaimed >= nr_pages; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 9a4441bbeef2..4bb13e72ac97 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -30,6 +30,8 @@ #include "internal.h" +#define NUMA_STATS_THRESHOLD (U16_MAX - 2) + #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; EXPORT_PER_CPU_SYMBOL(vm_event_states); @@ -87,8 +89,10 @@ void vm_events_fold_cpu(int cpu) * vm_stat contains the global counters */ atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp; +atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS] __cacheline_aligned_in_smp; atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp; EXPORT_SYMBOL(vm_zone_stat); +EXPORT_SYMBOL(vm_numa_stat); EXPORT_SYMBOL(vm_node_stat); #ifdef CONFIG_SMP @@ -604,6 +608,32 @@ EXPORT_SYMBOL(dec_node_page_state); * Fold a differential into the global counters. * Returns the number of counters updated. */ +#ifdef CONFIG_NUMA +static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff) +{ + int i; + int changes = 0; + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + if (zone_diff[i]) { + atomic_long_add(zone_diff[i], &vm_zone_stat[i]); + changes++; + } + + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) + if (numa_diff[i]) { + atomic_long_add(numa_diff[i], &vm_numa_stat[i]); + changes++; + } + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + if (node_diff[i]) { + atomic_long_add(node_diff[i], &vm_node_stat[i]); + changes++; + } + return changes; +} +#else static int fold_diff(int *zone_diff, int *node_diff) { int i; @@ -622,6 +652,7 @@ static int fold_diff(int *zone_diff, int *node_diff) } return changes; } +#endif /* CONFIG_NUMA */ /* * Update the zone counters for the current cpu. @@ -645,6 +676,9 @@ static int refresh_cpu_vm_stats(bool do_pagesets) struct zone *zone; int i; int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; +#ifdef CONFIG_NUMA + int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, }; +#endif int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; int changes = 0; @@ -666,6 +700,18 @@ static int refresh_cpu_vm_stats(bool do_pagesets) } } #ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) { + int v; + + v = this_cpu_xchg(p->vm_numa_stat_diff[i], 0); + if (v) { + + atomic_long_add(v, &zone->vm_numa_stat[i]); + global_numa_diff[i] += v; + __this_cpu_write(p->expire, 3); + } + } + if (do_pagesets) { cond_resched(); /* @@ -712,7 +758,12 @@ static int refresh_cpu_vm_stats(bool do_pagesets) } } +#ifdef CONFIG_NUMA + changes += fold_diff(global_zone_diff, global_numa_diff, + global_node_diff); +#else changes += fold_diff(global_zone_diff, global_node_diff); +#endif return changes; } @@ -727,6 +778,9 @@ void cpu_vm_stats_fold(int cpu) struct zone *zone; int i; int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; +#ifdef CONFIG_NUMA + int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, }; +#endif int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; for_each_populated_zone(zone) { @@ -743,6 +797,18 @@ void cpu_vm_stats_fold(int cpu) atomic_long_add(v, &zone->vm_stat[i]); global_zone_diff[i] += v; } + +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) + if (p->vm_numa_stat_diff[i]) { + int v; + + v = p->vm_numa_stat_diff[i]; + p->vm_numa_stat_diff[i] = 0; + atomic_long_add(v, &zone->vm_numa_stat[i]); + global_numa_diff[i] += v; + } +#endif } for_each_online_pgdat(pgdat) { @@ -761,7 +827,11 @@ void cpu_vm_stats_fold(int cpu) } } +#ifdef CONFIG_NUMA + fold_diff(global_zone_diff, global_numa_diff, global_node_diff); +#else fold_diff(global_zone_diff, global_node_diff); +#endif } /* @@ -779,10 +849,36 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) atomic_long_add(v, &zone->vm_stat[i]); atomic_long_add(v, &vm_zone_stat[i]); } + +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) + if (pset->vm_numa_stat_diff[i]) { + int v = pset->vm_numa_stat_diff[i]; + + pset->vm_numa_stat_diff[i] = 0; + atomic_long_add(v, &zone->vm_numa_stat[i]); + atomic_long_add(v, &vm_numa_stat[i]); + } +#endif } #endif #ifdef CONFIG_NUMA +void __inc_numa_state(struct zone *zone, + enum numa_stat_item item) +{ + struct per_cpu_pageset __percpu *pcp = zone->pageset; + u16 __percpu *p = pcp->vm_numa_stat_diff + item; + u16 v; + + v = __this_cpu_inc_return(*p); + + if (unlikely(v > NUMA_STATS_THRESHOLD)) { + zone_numa_state_add(v, zone, item); + __this_cpu_write(*p, 0); + } +} + /* * Determine the per node value of a stat item. This function * is called frequently in a NUMA machine, so try to be as @@ -802,6 +898,23 @@ unsigned long sum_zone_node_page_state(int node, } /* + * Determine the per node value of a numa stat item. To avoid deviation, + * the per cpu stat number in vm_numa_stat_diff[] is also included. + */ +unsigned long sum_zone_numa_state(int node, + enum numa_stat_item item) +{ + struct zone *zones = NODE_DATA(node)->node_zones; + int i; + unsigned long count = 0; + + for (i = 0; i < MAX_NR_ZONES; i++) + count += zone_numa_state_snapshot(zones + i, item); + + return count; +} + +/* * Determine the per node value of a stat item. */ unsigned long node_page_state(struct pglist_data *pgdat, @@ -870,6 +983,9 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in { unsigned long requested = 1UL << order; + if (WARN_ON_ONCE(order >= MAX_ORDER)) + return 0; + if (!info->free_blocks_total) return 0; @@ -934,6 +1050,9 @@ const char * const vmstat_text[] = { #if IS_ENABLED(CONFIG_ZSMALLOC) "nr_zspages", #endif + "nr_free_cma", + + /* enum numa_stat_item counters */ #ifdef CONFIG_NUMA "numa_hit", "numa_miss", @@ -942,7 +1061,6 @@ const char * const vmstat_text[] = { "numa_local", "numa_other", #endif - "nr_free_cma", /* Node-based counters */ "nr_inactive_anon", @@ -1071,6 +1189,8 @@ const char * const vmstat_text[] = { #endif "thp_zero_page_alloc", "thp_zero_page_alloc_failed", + "thp_swpout", + "thp_swpout_fallback", #endif #ifdef CONFIG_MEMORY_BALLOON "balloon_inflate", @@ -1093,11 +1213,14 @@ const char * const vmstat_text[] = { "vmacache_find_hits", "vmacache_full_flushes", #endif +#ifdef CONFIG_SWAP + "swap_ra", + "swap_ra_hit", +#endif #endif /* CONFIG_VM_EVENTS_COUNTERS */ }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ - #if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \ defined(CONFIG_PROC_FS) static void *frag_start(struct seq_file *m, loff_t *pos) @@ -1250,7 +1373,7 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, seq_putc(m, '\n'); } -/* Print out the free pages at each order for each migratetype */ +/* Print out the number of pageblocks for each migratetype */ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) { int mtype; @@ -1375,7 +1498,8 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, seq_printf(m, "\n per-node stats"); for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { seq_printf(m, "\n %-12s %lu", - vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], + vmstat_text[i + NR_VM_ZONE_STAT_ITEMS + + NR_VM_NUMA_STAT_ITEMS], node_page_state(pgdat, i)); } } @@ -1412,6 +1536,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, seq_printf(m, "\n %-12s %lu", vmstat_text[i], zone_page_state(zone, i)); +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) + seq_printf(m, "\n %-12s %lu", + vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], + zone_numa_state_snapshot(zone, i)); +#endif + seq_printf(m, "\n pagesets"); for_each_online_cpu(i) { struct per_cpu_pageset *pageset; @@ -1488,6 +1619,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) if (*pos >= ARRAY_SIZE(vmstat_text)) return NULL; stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + + NR_VM_NUMA_STAT_ITEMS * sizeof(unsigned long) + NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) + NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long); @@ -1500,9 +1632,15 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) if (!v) return ERR_PTR(-ENOMEM); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - v[i] = global_page_state(i); + v[i] = global_zone_page_state(i); v += NR_VM_ZONE_STAT_ITEMS; +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) + v[i] = global_numa_state(i); + v += NR_VM_NUMA_STAT_ITEMS; +#endif + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) v[i] = global_node_page_state(i); v += NR_VM_NODE_STAT_ITEMS; @@ -1589,7 +1727,7 @@ int vmstat_refresh(struct ctl_table *table, int write, * which can equally be echo'ed to or cat'ted from (by root), * can be used to update the stats just before reading them. * - * Oh, and since global_page_state() etc. are so careful to hide + * Oh, and since global_zone_page_state() etc. are so careful to hide * transiently negative values, report an error here if any of * the stats is negative, so we know to go looking for imbalance. */ @@ -1604,6 +1742,16 @@ int vmstat_refresh(struct ctl_table *table, int write, err = -EINVAL; } } +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) { + val = atomic_long_read(&vm_numa_stat[i]); + if (val < 0) { + pr_warn("%s: %s %ld\n", + __func__, vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], val); + err = -EINVAL; + } + } +#endif if (err) return err; if (write) @@ -1645,13 +1793,20 @@ static bool need_update(int cpu) struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu); BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1); +#ifdef CONFIG_NUMA + BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2); +#endif + /* * The fast way of checking if there are any vmstat diffs. * This works because the diffs are byte sized items. */ if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS)) return true; - +#ifdef CONFIG_NUMA + if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS)) + return true; +#endif } return false; } diff --git a/mm/z3fold.c b/mm/z3fold.c index 54f63c4a809a..486550df32be 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -23,10 +23,13 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/atomic.h> +#include <linux/sched.h> #include <linux/list.h> #include <linux/mm.h> #include <linux/module.h> +#include <linux/percpu.h> #include <linux/preempt.h> +#include <linux/workqueue.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/zpool.h> @@ -48,11 +51,15 @@ enum buddy { }; /* - * struct z3fold_header - z3fold page metadata occupying the first chunk of each + * struct z3fold_header - z3fold page metadata occupying first chunks of each * z3fold page, except for HEADLESS pages - * @buddy: links the z3fold page into the relevant list in the pool + * @buddy: links the z3fold page into the relevant list in the + * pool * @page_lock: per-page lock - * @refcount: reference cound for the z3fold page + * @refcount: reference count for the z3fold page + * @work: work_struct for page layout optimization + * @pool: pointer to the pool which this page belongs to + * @cpu: CPU which this page "belongs" to * @first_chunks: the size of the first buddy in chunks, 0 if free * @middle_chunks: the size of the middle buddy in chunks, 0 if free * @last_chunks: the size of the last buddy in chunks, 0 if free @@ -62,6 +69,9 @@ struct z3fold_header { struct list_head buddy; spinlock_t page_lock; struct kref refcount; + struct work_struct work; + struct z3fold_pool *pool; + short cpu; unsigned short first_chunks; unsigned short middle_chunks; unsigned short last_chunks; @@ -92,28 +102,39 @@ struct z3fold_header { /** * struct z3fold_pool - stores metadata for each z3fold pool - * @lock: protects all pool fields and first|last_chunk fields of any - * z3fold page in the pool - * @unbuddied: array of lists tracking z3fold pages that contain 2- buddies; - * the lists each z3fold page is added to depends on the size of - * its free region. + * @name: pool name + * @lock: protects pool unbuddied/lru lists + * @stale_lock: protects pool stale page list + * @unbuddied: per-cpu array of lists tracking z3fold pages that contain 2- + * buddies; the list each z3fold page is added to depends on + * the size of its free region. * @lru: list tracking the z3fold pages in LRU order by most recently * added buddy. + * @stale: list of pages marked for freeing * @pages_nr: number of z3fold pages in the pool. * @ops: pointer to a structure of user defined operations specified at * pool creation time. + * @compact_wq: workqueue for page layout background optimization + * @release_wq: workqueue for safe page release + * @work: work_struct for safe page release * * This structure is allocated at pool creation time and maintains metadata * pertaining to a particular z3fold pool. */ struct z3fold_pool { + const char *name; spinlock_t lock; - struct list_head unbuddied[NCHUNKS]; + spinlock_t stale_lock; + struct list_head *unbuddied; struct list_head lru; + struct list_head stale; atomic64_t pages_nr; const struct z3fold_ops *ops; struct zpool *zpool; const struct zpool_ops *zpool_ops; + struct workqueue_struct *compact_wq; + struct workqueue_struct *release_wq; + struct work_struct work; }; /* @@ -122,9 +143,10 @@ struct z3fold_pool { enum z3fold_page_flags { PAGE_HEADLESS = 0, MIDDLE_CHUNK_MAPPED, + NEEDS_COMPACTING, + PAGE_STALE }; - /***************** * Helpers *****************/ @@ -138,14 +160,19 @@ static int size_to_chunks(size_t size) #define for_each_unbuddied_list(_iter, _begin) \ for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++) +static void compact_page_work(struct work_struct *w); + /* Initializes the z3fold header of a newly allocated z3fold page */ -static struct z3fold_header *init_z3fold_page(struct page *page) +static struct z3fold_header *init_z3fold_page(struct page *page, + struct z3fold_pool *pool) { struct z3fold_header *zhdr = page_address(page); INIT_LIST_HEAD(&page->lru); clear_bit(PAGE_HEADLESS, &page->private); clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); + clear_bit(NEEDS_COMPACTING, &page->private); + clear_bit(PAGE_STALE, &page->private); spin_lock_init(&zhdr->page_lock); kref_init(&zhdr->refcount); @@ -154,7 +181,10 @@ static struct z3fold_header *init_z3fold_page(struct page *page) zhdr->last_chunks = 0; zhdr->first_num = 0; zhdr->start_middle = 0; + zhdr->cpu = -1; + zhdr->pool = pool; INIT_LIST_HEAD(&zhdr->buddy); + INIT_WORK(&zhdr->work, compact_page_work); return zhdr; } @@ -164,21 +194,6 @@ static void free_z3fold_page(struct page *page) __free_page(page); } -static void release_z3fold_page(struct kref *ref) -{ - struct z3fold_header *zhdr; - struct page *page; - - zhdr = container_of(ref, struct z3fold_header, refcount); - page = virt_to_page(zhdr); - - if (!list_empty(&zhdr->buddy)) - list_del(&zhdr->buddy); - if (!list_empty(&page->lru)) - list_del(&page->lru); - free_z3fold_page(page); -} - /* Lock a z3fold page */ static inline void z3fold_page_lock(struct z3fold_header *zhdr) { @@ -228,6 +243,76 @@ static enum buddy handle_to_buddy(unsigned long handle) return (handle - zhdr->first_num) & BUDDY_MASK; } +static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) +{ + struct page *page = virt_to_page(zhdr); + struct z3fold_pool *pool = zhdr->pool; + + WARN_ON(!list_empty(&zhdr->buddy)); + set_bit(PAGE_STALE, &page->private); + spin_lock(&pool->lock); + if (!list_empty(&page->lru)) + list_del(&page->lru); + spin_unlock(&pool->lock); + if (locked) + z3fold_page_unlock(zhdr); + spin_lock(&pool->stale_lock); + list_add(&zhdr->buddy, &pool->stale); + queue_work(pool->release_wq, &pool->work); + spin_unlock(&pool->stale_lock); +} + +static void __attribute__((__unused__)) + release_z3fold_page(struct kref *ref) +{ + struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, + refcount); + __release_z3fold_page(zhdr, false); +} + +static void release_z3fold_page_locked(struct kref *ref) +{ + struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, + refcount); + WARN_ON(z3fold_page_trylock(zhdr)); + __release_z3fold_page(zhdr, true); +} + +static void release_z3fold_page_locked_list(struct kref *ref) +{ + struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, + refcount); + spin_lock(&zhdr->pool->lock); + list_del_init(&zhdr->buddy); + spin_unlock(&zhdr->pool->lock); + + WARN_ON(z3fold_page_trylock(zhdr)); + __release_z3fold_page(zhdr, true); +} + +static void free_pages_work(struct work_struct *w) +{ + struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work); + + spin_lock(&pool->stale_lock); + while (!list_empty(&pool->stale)) { + struct z3fold_header *zhdr = list_first_entry(&pool->stale, + struct z3fold_header, buddy); + struct page *page = virt_to_page(zhdr); + + list_del(&zhdr->buddy); + if (WARN_ON(!test_bit(PAGE_STALE, &page->private))) + continue; + clear_bit(NEEDS_COMPACTING, &page->private); + spin_unlock(&pool->stale_lock); + cancel_work_sync(&zhdr->work); + free_z3fold_page(page); + cond_resched(); + spin_lock(&pool->stale_lock); + } + spin_unlock(&pool->stale_lock); +} + /* * Returns the number of free chunks in a z3fold page. * NB: can't be used with HEADLESS pages. @@ -252,46 +337,6 @@ static int num_free_chunks(struct z3fold_header *zhdr) return nfree; } -/***************** - * API Functions -*****************/ -/** - * z3fold_create_pool() - create a new z3fold pool - * @gfp: gfp flags when allocating the z3fold pool structure - * @ops: user-defined operations for the z3fold pool - * - * Return: pointer to the new z3fold pool or NULL if the metadata allocation - * failed. - */ -static struct z3fold_pool *z3fold_create_pool(gfp_t gfp, - const struct z3fold_ops *ops) -{ - struct z3fold_pool *pool; - int i; - - pool = kzalloc(sizeof(struct z3fold_pool), gfp); - if (!pool) - return NULL; - spin_lock_init(&pool->lock); - for_each_unbuddied_list(i, 0) - INIT_LIST_HEAD(&pool->unbuddied[i]); - INIT_LIST_HEAD(&pool->lru); - atomic64_set(&pool->pages_nr, 0); - pool->ops = ops; - return pool; -} - -/** - * z3fold_destroy_pool() - destroys an existing z3fold pool - * @pool: the z3fold pool to be destroyed - * - * The pool should be emptied before this function is called. - */ -static void z3fold_destroy_pool(struct z3fold_pool *pool) -{ - kfree(pool); -} - static inline void *mchunk_memmove(struct z3fold_header *zhdr, unsigned short dst_chunk) { @@ -347,6 +392,117 @@ static int z3fold_compact_page(struct z3fold_header *zhdr) return 0; } +static void do_compact_page(struct z3fold_header *zhdr, bool locked) +{ + struct z3fold_pool *pool = zhdr->pool; + struct page *page; + struct list_head *unbuddied; + int fchunks; + + page = virt_to_page(zhdr); + if (locked) + WARN_ON(z3fold_page_trylock(zhdr)); + else + z3fold_page_lock(zhdr); + if (test_bit(PAGE_STALE, &page->private) || + !test_and_clear_bit(NEEDS_COMPACTING, &page->private)) { + z3fold_page_unlock(zhdr); + return; + } + spin_lock(&pool->lock); + list_del_init(&zhdr->buddy); + spin_unlock(&pool->lock); + + z3fold_compact_page(zhdr); + unbuddied = get_cpu_ptr(pool->unbuddied); + fchunks = num_free_chunks(zhdr); + if (fchunks < NCHUNKS && + (!zhdr->first_chunks || !zhdr->middle_chunks || + !zhdr->last_chunks)) { + /* the page's not completely free and it's unbuddied */ + spin_lock(&pool->lock); + list_add(&zhdr->buddy, &unbuddied[fchunks]); + spin_unlock(&pool->lock); + zhdr->cpu = smp_processor_id(); + } + put_cpu_ptr(pool->unbuddied); + z3fold_page_unlock(zhdr); +} + +static void compact_page_work(struct work_struct *w) +{ + struct z3fold_header *zhdr = container_of(w, struct z3fold_header, + work); + + do_compact_page(zhdr, false); +} + + +/* + * API Functions + */ + +/** + * z3fold_create_pool() - create a new z3fold pool + * @name: pool name + * @gfp: gfp flags when allocating the z3fold pool structure + * @ops: user-defined operations for the z3fold pool + * + * Return: pointer to the new z3fold pool or NULL if the metadata allocation + * failed. + */ +static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, + const struct z3fold_ops *ops) +{ + struct z3fold_pool *pool = NULL; + int i, cpu; + + pool = kzalloc(sizeof(struct z3fold_pool), gfp); + if (!pool) + goto out; + spin_lock_init(&pool->lock); + spin_lock_init(&pool->stale_lock); + pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2); + for_each_possible_cpu(cpu) { + struct list_head *unbuddied = + per_cpu_ptr(pool->unbuddied, cpu); + for_each_unbuddied_list(i, 0) + INIT_LIST_HEAD(&unbuddied[i]); + } + INIT_LIST_HEAD(&pool->lru); + INIT_LIST_HEAD(&pool->stale); + atomic64_set(&pool->pages_nr, 0); + pool->name = name; + pool->compact_wq = create_singlethread_workqueue(pool->name); + if (!pool->compact_wq) + goto out; + pool->release_wq = create_singlethread_workqueue(pool->name); + if (!pool->release_wq) + goto out_wq; + INIT_WORK(&pool->work, free_pages_work); + pool->ops = ops; + return pool; + +out_wq: + destroy_workqueue(pool->compact_wq); +out: + kfree(pool); + return NULL; +} + +/** + * z3fold_destroy_pool() - destroys an existing z3fold pool + * @pool: the z3fold pool to be destroyed + * + * The pool should be emptied before this function is called. + */ +static void z3fold_destroy_pool(struct z3fold_pool *pool) +{ + destroy_workqueue(pool->release_wq); + destroy_workqueue(pool->compact_wq); + kfree(pool); +} + /** * z3fold_alloc() - allocates a region of a given size * @pool: z3fold pool from which to allocate @@ -371,8 +527,9 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, { int chunks = 0, i, freechunks; struct z3fold_header *zhdr = NULL; + struct page *page = NULL; enum buddy bud; - struct page *page; + bool can_sleep = (gfp & __GFP_RECLAIM) == __GFP_RECLAIM; if (!size || (gfp & __GFP_HIGHMEM)) return -EINVAL; @@ -383,23 +540,57 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) bud = HEADLESS; else { + struct list_head *unbuddied; chunks = size_to_chunks(size); +lookup: /* First, try to find an unbuddied z3fold page. */ - zhdr = NULL; + unbuddied = get_cpu_ptr(pool->unbuddied); for_each_unbuddied_list(i, chunks) { - spin_lock(&pool->lock); - zhdr = list_first_entry_or_null(&pool->unbuddied[i], + struct list_head *l = &unbuddied[i]; + + zhdr = list_first_entry_or_null(READ_ONCE(l), struct z3fold_header, buddy); - if (!zhdr || !z3fold_page_trylock(zhdr)) { - spin_unlock(&pool->lock); + + if (!zhdr) continue; + + /* Re-check under lock. */ + spin_lock(&pool->lock); + l = &unbuddied[i]; + if (unlikely(zhdr != list_first_entry(READ_ONCE(l), + struct z3fold_header, buddy)) || + !z3fold_page_trylock(zhdr)) { + spin_unlock(&pool->lock); + put_cpu_ptr(pool->unbuddied); + goto lookup; } - kref_get(&zhdr->refcount); list_del_init(&zhdr->buddy); + zhdr->cpu = -1; spin_unlock(&pool->lock); page = virt_to_page(zhdr); + if (test_bit(NEEDS_COMPACTING, &page->private)) { + z3fold_page_unlock(zhdr); + zhdr = NULL; + put_cpu_ptr(pool->unbuddied); + if (can_sleep) + cond_resched(); + goto lookup; + } + + /* + * this page could not be removed from its unbuddied + * list while pool lock was held, and then we've taken + * page lock so kref_put could not be called before + * we got here, so it's safe to just call kref_get() + */ + kref_get(&zhdr->refcount); + break; + } + put_cpu_ptr(pool->unbuddied); + + if (zhdr) { if (zhdr->first_chunks == 0) { if (zhdr->middle_chunks != 0 && chunks >= zhdr->start_middle) @@ -411,32 +602,49 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, else if (zhdr->middle_chunks == 0) bud = MIDDLE; else { - z3fold_page_unlock(zhdr); - spin_lock(&pool->lock); if (kref_put(&zhdr->refcount, - release_z3fold_page)) + release_z3fold_page_locked)) atomic64_dec(&pool->pages_nr); - spin_unlock(&pool->lock); + else + z3fold_page_unlock(zhdr); pr_err("No free chunks in unbuddied\n"); WARN_ON(1); - continue; + goto lookup; } goto found; } bud = FIRST; } - /* Couldn't find unbuddied z3fold page, create new one */ - page = alloc_page(gfp); + spin_lock(&pool->stale_lock); + zhdr = list_first_entry_or_null(&pool->stale, + struct z3fold_header, buddy); + /* + * Before allocating a page, let's see if we can take one from the + * stale pages list. cancel_work_sync() can sleep so we must make + * sure it won't be called in case we're in atomic context. + */ + if (zhdr && (can_sleep || !work_pending(&zhdr->work) || + !unlikely(work_busy(&zhdr->work)))) { + list_del(&zhdr->buddy); + clear_bit(NEEDS_COMPACTING, &page->private); + spin_unlock(&pool->stale_lock); + if (can_sleep) + cancel_work_sync(&zhdr->work); + page = virt_to_page(zhdr); + } else { + spin_unlock(&pool->stale_lock); + page = alloc_page(gfp); + } + if (!page) return -ENOMEM; atomic64_inc(&pool->pages_nr); - zhdr = init_z3fold_page(page); + zhdr = init_z3fold_page(page, pool); if (bud == HEADLESS) { set_bit(PAGE_HEADLESS, &page->private); - spin_lock(&pool->lock); goto headless; } z3fold_page_lock(zhdr); @@ -451,15 +659,21 @@ found: zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS; } - spin_lock(&pool->lock); if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 || zhdr->middle_chunks == 0) { + struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied); + /* Add to unbuddied list */ freechunks = num_free_chunks(zhdr); - list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + spin_lock(&pool->lock); + list_add(&zhdr->buddy, &unbuddied[freechunks]); + spin_unlock(&pool->lock); + zhdr->cpu = smp_processor_id(); + put_cpu_ptr(pool->unbuddied); } headless: + spin_lock(&pool->lock); /* Add/move z3fold page to beginning of LRU */ if (!list_empty(&page->lru)) list_del(&page->lru); @@ -487,7 +701,6 @@ headless: static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) { struct z3fold_header *zhdr; - int freechunks; struct page *page; enum buddy bud; @@ -526,25 +739,27 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) spin_unlock(&pool->lock); free_z3fold_page(page); atomic64_dec(&pool->pages_nr); - } else { - if (zhdr->first_chunks != 0 || zhdr->middle_chunks != 0 || - zhdr->last_chunks != 0) { - z3fold_compact_page(zhdr); - /* Add to the unbuddied list */ - spin_lock(&pool->lock); - if (!list_empty(&zhdr->buddy)) - list_del(&zhdr->buddy); - freechunks = num_free_chunks(zhdr); - list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); - spin_unlock(&pool->lock); - } + return; + } + + if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) { + atomic64_dec(&pool->pages_nr); + return; + } + if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) { z3fold_page_unlock(zhdr); + return; + } + if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) { spin_lock(&pool->lock); - if (kref_put(&zhdr->refcount, release_z3fold_page)) - atomic64_dec(&pool->pages_nr); + list_del_init(&zhdr->buddy); spin_unlock(&pool->lock); + zhdr->cpu = -1; + do_compact_page(zhdr, true); + return; } - + queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work); + z3fold_page_unlock(zhdr); } /** @@ -585,9 +800,10 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) */ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) { - int i, ret = 0, freechunks; - struct z3fold_header *zhdr; - struct page *page; + int i, ret = 0; + struct z3fold_header *zhdr = NULL; + struct page *page = NULL; + struct list_head *pos; unsigned long first_handle = 0, middle_handle = 0, last_handle = 0; spin_lock(&pool->lock); @@ -600,16 +816,24 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) spin_unlock(&pool->lock); return -EINVAL; } - page = list_last_entry(&pool->lru, struct page, lru); + list_for_each_prev(pos, &pool->lru) { + page = list_entry(pos, struct page, lru); + if (test_bit(PAGE_HEADLESS, &page->private)) + /* candidate found */ + break; + + zhdr = page_address(page); + if (!z3fold_page_trylock(zhdr)) + continue; /* can't evict at this point */ + kref_get(&zhdr->refcount); + list_del_init(&zhdr->buddy); + zhdr->cpu = -1; + } + list_del_init(&page->lru); + spin_unlock(&pool->lock); - zhdr = page_address(page); if (!test_bit(PAGE_HEADLESS, &page->private)) { - if (!list_empty(&zhdr->buddy)) - list_del_init(&zhdr->buddy); - kref_get(&zhdr->refcount); - spin_unlock(&pool->lock); - z3fold_page_lock(zhdr); /* * We need encode the handles before unlocking, since * we can race with free that will set @@ -624,11 +848,14 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) middle_handle = encode_handle(zhdr, MIDDLE); if (zhdr->last_chunks) last_handle = encode_handle(zhdr, LAST); + /* + * it's safe to unlock here because we hold a + * reference to this page + */ z3fold_page_unlock(zhdr); } else { first_handle = encode_handle(zhdr, HEADLESS); last_handle = middle_handle = 0; - spin_unlock(&pool->lock); } /* Issue the eviction callback(s) */ @@ -652,31 +879,12 @@ next: if (ret == 0) { free_z3fold_page(page); return 0; - } else { - spin_lock(&pool->lock); - } - } else { - z3fold_page_lock(zhdr); - if ((zhdr->first_chunks || zhdr->last_chunks || - zhdr->middle_chunks) && - !(zhdr->first_chunks && zhdr->last_chunks && - zhdr->middle_chunks)) { - z3fold_compact_page(zhdr); - /* add to unbuddied list */ - spin_lock(&pool->lock); - freechunks = num_free_chunks(zhdr); - list_add(&zhdr->buddy, - &pool->unbuddied[freechunks]); - spin_unlock(&pool->lock); - } - z3fold_page_unlock(zhdr); - spin_lock(&pool->lock); - if (kref_put(&zhdr->refcount, release_z3fold_page)) { - spin_unlock(&pool->lock); - atomic64_dec(&pool->pages_nr); - return 0; } + } else if (kref_put(&zhdr->refcount, release_z3fold_page)) { + atomic64_dec(&pool->pages_nr); + return 0; } + spin_lock(&pool->lock); /* * Add to the beginning of LRU. @@ -795,7 +1003,8 @@ static void *z3fold_zpool_create(const char *name, gfp_t gfp, { struct z3fold_pool *pool; - pool = z3fold_create_pool(gfp, zpool_ops ? &z3fold_zpool_ops : NULL); + pool = z3fold_create_pool(name, gfp, + zpool_ops ? &z3fold_zpool_ops : NULL); if (pool) { pool->zpool = zpool; pool->zpool_ops = zpool_ops; diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 308acb9d814b..7c38e850a8fc 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -551,20 +551,23 @@ static int get_size_class_index(int size) return min_t(int, ZS_SIZE_CLASSES - 1, idx); } +/* type can be of enum type zs_stat_type or fullness_group */ static inline void zs_stat_inc(struct size_class *class, - enum zs_stat_type type, unsigned long cnt) + int type, unsigned long cnt) { class->stats.objs[type] += cnt; } +/* type can be of enum type zs_stat_type or fullness_group */ static inline void zs_stat_dec(struct size_class *class, - enum zs_stat_type type, unsigned long cnt) + int type, unsigned long cnt) { class->stats.objs[type] -= cnt; } +/* type can be of enum type zs_stat_type or fullness_group */ static inline unsigned long zs_stat_get(struct size_class *class, - enum zs_stat_type type) + int type) { return class->stats.objs[type]; } @@ -1969,6 +1972,14 @@ int zs_page_migrate(struct address_space *mapping, struct page *newpage, unsigned int obj_idx; int ret = -EAGAIN; + /* + * We cannot support the _NO_COPY case here, because copy needs to + * happen under the zs lock, which does not work with + * MIGRATE_SYNC_NO_COPY workflow. + */ + if (mode == MIGRATE_SYNC_NO_COPY) + return -EINVAL; + VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); @@ -1983,8 +1994,11 @@ int zs_page_migrate(struct address_space *mapping, struct page *newpage, spin_lock(&class->lock); if (!get_zspage_inuse(zspage)) { - ret = -EBUSY; - goto unlock_class; + /* + * Set "offset" to end of the page so that every loops + * skips unnecessary object scanning. + */ + offset = PAGE_SIZE; } pos = offset; @@ -2052,7 +2066,6 @@ unpin_objects: } } kunmap_atomic(s_addr); -unlock_class: spin_unlock(&class->lock); migrate_write_unlock(zspage); |