diff options
author | Kirill A. Shutemov <kirill.shutemov@linux.intel.com> | 2016-01-15 16:52:56 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-01-15 17:56:32 -0800 |
commit | ddc58f27f9eee9117219936f77e90ad5b2e00e96 (patch) | |
tree | a403f6b83ab95c297f6f6ab03a4756e2ea285d63 /mm/swap.c | |
parent | ad0bed24e98bcae9952c2d1f663ec7cb6344a387 (diff) |
mm: drop tail page refcounting
Tail page refcounting is utterly complicated and painful to support.
It uses ->_mapcount on tail pages to store how many times this page is
pinned. get_page() bumps ->_mapcount on tail page in addition to
->_count on head. This information is required by split_huge_page() to
be able to distribute pins from head of compound page to tails during
the split.
We will need ->_mapcount to account PTE mappings of subpages of the
compound page. We eliminate need in current meaning of ->_mapcount in
tail pages by forbidding split entirely if the page is pinned.
The only user of tail page refcounting is THP which is marked BROKEN for
now.
Let's drop all this mess. It makes get_page() and put_page() much
simpler.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Tested-by: Sasha Levin <sasha.levin@oracle.com>
Tested-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Jerome Marchand <jmarchan@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Steve Capper <steve.capper@linaro.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/swap.c')
-rw-r--r-- | mm/swap.c | 273 |
1 files changed, 14 insertions, 259 deletions
diff --git a/mm/swap.c b/mm/swap.c index 39395fb549c0..3d65480422e8 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -89,260 +89,14 @@ static void __put_compound_page(struct page *page) (*dtor)(page); } -/** - * Two special cases here: we could avoid taking compound_lock_irqsave - * and could skip the tail refcounting(in _mapcount). - * - * 1. Hugetlbfs page: - * - * PageHeadHuge will remain true until the compound page - * is released and enters the buddy allocator, and it could - * not be split by __split_huge_page_refcount(). - * - * So if we see PageHeadHuge set, and we have the tail page pin, - * then we could safely put head page. - * - * 2. Slab THP page: - * - * PG_slab is cleared before the slab frees the head page, and - * tail pin cannot be the last reference left on the head page, - * because the slab code is free to reuse the compound page - * after a kfree/kmem_cache_free without having to check if - * there's any tail pin left. In turn all tail pinsmust be always - * released while the head is still pinned by the slab code - * and so we know PG_slab will be still set too. - * - * So if we see PageSlab set, and we have the tail page pin, - * then we could safely put head page. - */ -static __always_inline -void put_unrefcounted_compound_page(struct page *page_head, struct page *page) -{ - /* - * If @page is a THP tail, we must read the tail page - * flags after the head page flags. The - * __split_huge_page_refcount side enforces write memory barriers - * between clearing PageTail and before the head page - * can be freed and reallocated. - */ - smp_rmb(); - if (likely(PageTail(page))) { - /* - * __split_huge_page_refcount cannot race - * here, see the comment above this function. - */ - VM_BUG_ON_PAGE(!PageHead(page_head), page_head); - if (put_page_testzero(page_head)) { - /* - * If this is the tail of a slab THP page, - * the tail pin must not be the last reference - * held on the page, because the PG_slab cannot - * be cleared before all tail pins (which skips - * the _mapcount tail refcounting) have been - * released. - * - * If this is the tail of a hugetlbfs page, - * the tail pin may be the last reference on - * the page instead, because PageHeadHuge will - * not go away until the compound page enters - * the buddy allocator. - */ - VM_BUG_ON_PAGE(PageSlab(page_head), page_head); - __put_compound_page(page_head); - } - } else - /* - * __split_huge_page_refcount run before us, - * @page was a THP tail. The split @page_head - * has been freed and reallocated as slab or - * hugetlbfs page of smaller order (only - * possible if reallocated as slab on x86). - */ - if (put_page_testzero(page)) - __put_single_page(page); -} - -static __always_inline -void put_refcounted_compound_page(struct page *page_head, struct page *page) -{ - if (likely(page != page_head && get_page_unless_zero(page_head))) { - unsigned long flags; - - /* - * @page_head wasn't a dangling pointer but it may not - * be a head page anymore by the time we obtain the - * lock. That is ok as long as it can't be freed from - * under us. - */ - flags = compound_lock_irqsave(page_head); - if (unlikely(!PageTail(page))) { - /* __split_huge_page_refcount run before us */ - compound_unlock_irqrestore(page_head, flags); - if (put_page_testzero(page_head)) { - /* - * The @page_head may have been freed - * and reallocated as a compound page - * of smaller order and then freed - * again. All we know is that it - * cannot have become: a THP page, a - * compound page of higher order, a - * tail page. That is because we - * still hold the refcount of the - * split THP tail and page_head was - * the THP head before the split. - */ - if (PageHead(page_head)) - __put_compound_page(page_head); - else - __put_single_page(page_head); - } -out_put_single: - if (put_page_testzero(page)) - __put_single_page(page); - return; - } - VM_BUG_ON_PAGE(page_head != compound_head(page), page); - /* - * We can release the refcount taken by - * get_page_unless_zero() now that - * __split_huge_page_refcount() is blocked on the - * compound_lock. - */ - if (put_page_testzero(page_head)) - VM_BUG_ON_PAGE(1, page_head); - /* __split_huge_page_refcount will wait now */ - VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page); - atomic_dec(&page->_mapcount); - VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head); - VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page); - compound_unlock_irqrestore(page_head, flags); - - if (put_page_testzero(page_head)) { - if (PageHead(page_head)) - __put_compound_page(page_head); - else - __put_single_page(page_head); - } - } else { - /* @page_head is a dangling pointer */ - VM_BUG_ON_PAGE(PageTail(page), page); - goto out_put_single; - } -} - -static void put_compound_page(struct page *page) -{ - struct page *page_head; - - /* - * We see the PageCompound set and PageTail not set, so @page maybe: - * 1. hugetlbfs head page, or - * 2. THP head page. - */ - if (likely(!PageTail(page))) { - if (put_page_testzero(page)) { - /* - * By the time all refcounts have been released - * split_huge_page cannot run anymore from under us. - */ - if (PageHead(page)) - __put_compound_page(page); - else - __put_single_page(page); - } - return; - } - - /* - * We see the PageCompound set and PageTail set, so @page maybe: - * 1. a tail hugetlbfs page, or - * 2. a tail THP page, or - * 3. a split THP page. - * - * Case 3 is possible, as we may race with - * __split_huge_page_refcount tearing down a THP page. - */ - page_head = compound_head(page); - if (!__compound_tail_refcounted(page_head)) - put_unrefcounted_compound_page(page_head, page); - else - put_refcounted_compound_page(page_head, page); -} - -void put_page(struct page *page) +void __put_page(struct page *page) { if (unlikely(PageCompound(page))) - put_compound_page(page); - else if (put_page_testzero(page)) + __put_compound_page(page); + else __put_single_page(page); } -EXPORT_SYMBOL(put_page); - -/* - * This function is exported but must not be called by anything other - * than get_page(). It implements the slow path of get_page(). - */ -bool __get_page_tail(struct page *page) -{ - /* - * This takes care of get_page() if run on a tail page - * returned by one of the get_user_pages/follow_page variants. - * get_user_pages/follow_page itself doesn't need the compound - * lock because it runs __get_page_tail_foll() under the - * proper PT lock that already serializes against - * split_huge_page(). - */ - unsigned long flags; - bool got; - struct page *page_head = compound_head(page); - - /* Ref to put_compound_page() comment. */ - if (!__compound_tail_refcounted(page_head)) { - smp_rmb(); - if (likely(PageTail(page))) { - /* - * This is a hugetlbfs page or a slab - * page. __split_huge_page_refcount - * cannot race here. - */ - VM_BUG_ON_PAGE(!PageHead(page_head), page_head); - __get_page_tail_foll(page, true); - return true; - } else { - /* - * __split_huge_page_refcount run - * before us, "page" was a THP - * tail. The split page_head has been - * freed and reallocated as slab or - * hugetlbfs page of smaller order - * (only possible if reallocated as - * slab on x86). - */ - return false; - } - } - - got = false; - if (likely(page != page_head && get_page_unless_zero(page_head))) { - /* - * page_head wasn't a dangling pointer but it - * may not be a head page anymore by the time - * we obtain the lock. That is ok as long as it - * can't be freed from under us. - */ - flags = compound_lock_irqsave(page_head); - /* here __split_huge_page_refcount won't run anymore */ - if (likely(PageTail(page))) { - __get_page_tail_foll(page, false); - got = true; - } - compound_unlock_irqrestore(page_head, flags); - if (unlikely(!got)) - put_page(page_head); - } - return got; -} -EXPORT_SYMBOL(__get_page_tail); +EXPORT_SYMBOL(__put_page); /** * put_pages_list() - release a list of pages @@ -918,15 +672,6 @@ void release_pages(struct page **pages, int nr, bool cold) for (i = 0; i < nr; i++) { struct page *page = pages[i]; - if (unlikely(PageCompound(page))) { - if (zone) { - spin_unlock_irqrestore(&zone->lru_lock, flags); - zone = NULL; - } - put_compound_page(page); - continue; - } - /* * Make sure the IRQ-safe lock-holding time does not get * excessive with a continuous string of pages from the @@ -937,9 +682,19 @@ void release_pages(struct page **pages, int nr, bool cold) zone = NULL; } + page = compound_head(page); if (!put_page_testzero(page)) continue; + if (PageCompound(page)) { + if (zone) { + spin_unlock_irqrestore(&zone->lru_lock, flags); + zone = NULL; + } + __put_compound_page(page); + continue; + } + if (PageLRU(page)) { struct zone *pagezone = page_zone(page); |