From 1d798ca3f16437c71ff63e36597ff07f9c12e4d6 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 6 Nov 2015 16:29:54 -0800 Subject: mm: make compound_head() robust Hugh has pointed that compound_head() call can be unsafe in some context. There's one example: CPU0 CPU1 isolate_migratepages_block() page_count() compound_head() !!PageTail() == true put_page() tail->first_page = NULL head = tail->first_page alloc_pages(__GFP_COMP) prep_compound_page() tail->first_page = head __SetPageTail(p); !!PageTail() == true The race is pure theoretical. I don't it's possible to trigger it in practice. But who knows. We can fix the race by changing how encode PageTail() and compound_head() within struct page to be able to update them in one shot. The patch introduces page->compound_head into third double word block in front of compound_dtor and compound_order. Bit 0 encodes PageTail() and the rest bits are pointer to head page if bit zero is set. The patch moves page->pmd_huge_pte out of word, just in case if an architecture defines pgtable_t into something what can have the bit 0 set. hugetlb_cgroup uses page->lru.next in the second tail page to store pointer struct hugetlb_cgroup. The patch switch it to use page->private in the second tail page instead. The space is free since ->first_page is removed from the union. The patch also opens possibility to remove HUGETLB_CGROUP_MIN_ORDER limitation, since there's now space in first tail page to store struct hugetlb_cgroup pointer. But that's out of scope of the patch. That means page->compound_head shares storage space with: - page->lru.next; - page->next; - page->rcu_head.next; That's too long list to be absolutely sure, but looks like nobody uses bit 0 of the word. page->rcu_head.next guaranteed[1] to have bit 0 clean as long as we use call_rcu(), call_rcu_bh(), call_rcu_sched(), or call_srcu(). But future call_rcu_lazy() is not allowed as it makes use of the bit and we can get false positive PageTail(). [1] http://lkml.kernel.org/g/20150827163634.GD4029@linux.vnet.ibm.com Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Reviewed-by: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Cc: Vlastimil Babka Acked-by: Paul E. McKenney Cc: Aneesh Kumar K.V Cc: Andi Kleen Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 12 ------------ mm/debug.c | 5 ----- mm/huge_memory.c | 3 +-- mm/hugetlb.c | 8 ++------ mm/hugetlb_cgroup.c | 2 +- mm/internal.h | 4 ++-- mm/memory-failure.c | 7 ------- mm/page_alloc.c | 48 +++++++++++++++++++++++++++++++----------------- mm/swap.c | 4 ++-- 9 files changed, 39 insertions(+), 54 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 0d9fdcd01e47..97a4e06b15c0 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -200,18 +200,6 @@ config MEMORY_HOTREMOVE depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE depends on MIGRATION -# -# If we have space for more page flags then we can enable additional -# optimizations and functionality. -# -# Regular Sparsemem takes page flag bits for the sectionid if it does not -# use a virtual memmap. Disable extended page flags for 32 bit platforms -# that require the use of a sectionid in the page flags. -# -config PAGEFLAGS_EXTENDED - def_bool y - depends on 64BIT || SPARSEMEM_VMEMMAP || !SPARSEMEM - # Heavily threaded applications may benefit from splitting the mm-wide # page_table_lock, so that faults on different parts of the user address # space can be handled with less contention: split it at this NR_CPUS. diff --git a/mm/debug.c b/mm/debug.c index e784110fb51d..668aa35191ca 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -25,12 +25,7 @@ static const struct trace_print_flags pageflag_names[] = { {1UL << PG_private, "private" }, {1UL << PG_private_2, "private_2" }, {1UL << PG_writeback, "writeback" }, -#ifdef CONFIG_PAGEFLAGS_EXTENDED {1UL << PG_head, "head" }, - {1UL << PG_tail, "tail" }, -#else - {1UL << PG_compound, "compound" }, -#endif {1UL << PG_swapcache, "swapcache" }, {1UL << PG_mappedtodisk, "mappedtodisk" }, {1UL << PG_reclaim, "reclaim" }, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 73266ee7274c..e1ccc83f73d3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1755,8 +1755,7 @@ static void __split_huge_page_refcount(struct page *page, (1L << PG_unevictable))); page_tail->flags |= (1L << PG_dirty); - /* clear PageTail before overwriting first_page */ - smp_wmb(); + clear_compound_head(page_tail); if (page_is_young(page)) set_page_young(page_tail); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e90a29024c5c..4eb0f0964883 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1001,9 +1001,8 @@ static void destroy_compound_gigantic_page(struct page *page, struct page *p = page + 1; for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { - __ClearPageTail(p); + clear_compound_head(p); set_page_refcounted(p); - p->first_page = NULL; } set_compound_order(page, 0); @@ -1276,10 +1275,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) */ __ClearPageReserved(p); set_page_count(p, 0); - p->first_page = page; - /* Make sure p->first_page is always valid for PageTail() */ - smp_wmb(); - __SetPageTail(p); + set_compound_head(p, page); } } diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 33d59abe91f1..d8fb10de0f14 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -385,7 +385,7 @@ void __init hugetlb_cgroup_file_init(void) /* * Add cgroup control files only if the huge page consists * of more than two normal pages. This is because we use - * page[2].lru.next for storing cgroup details. + * page[2].private for storing cgroup details. */ if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER) __hugetlb_cgroup_file_init(hstate_index(h)); diff --git a/mm/internal.h b/mm/internal.h index 5b7841f6fa27..a7f5670fea23 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -80,9 +80,9 @@ static inline void __get_page_tail_foll(struct page *page, * speculative page access (like in * page_cache_get_speculative()) on tail pages. */ - VM_BUG_ON_PAGE(atomic_read(&page->first_page->_count) <= 0, page); + VM_BUG_ON_PAGE(atomic_read(&compound_head(page)->_count) <= 0, page); if (get_page_head) - atomic_inc(&page->first_page->_count); + atomic_inc(&compound_head(page)->_count); get_huge_page_tail(page); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 16a0ec385320..8424b64711ac 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -776,8 +776,6 @@ static int me_huge_page(struct page *p, unsigned long pfn) #define lru (1UL << PG_lru) #define swapbacked (1UL << PG_swapbacked) #define head (1UL << PG_head) -#define tail (1UL << PG_tail) -#define compound (1UL << PG_compound) #define slab (1UL << PG_slab) #define reserved (1UL << PG_reserved) @@ -800,12 +798,7 @@ static struct page_state { */ { slab, slab, MF_MSG_SLAB, me_kernel }, -#ifdef CONFIG_PAGEFLAGS_EXTENDED { head, head, MF_MSG_HUGE, me_huge_page }, - { tail, tail, MF_MSG_HUGE, me_huge_page }, -#else - { compound, compound, MF_MSG_HUGE, me_huge_page }, -#endif { sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty }, { sc|dirty, sc, MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean }, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fae1bd6f9f37..e361001519d3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -445,15 +445,15 @@ out: /* * Higher-order pages are called "compound pages". They are structured thusly: * - * The first PAGE_SIZE page is called the "head page". + * The first PAGE_SIZE page is called the "head page" and have PG_head set. * - * The remaining PAGE_SIZE pages are called "tail pages". + * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded + * in bit 0 of page->compound_head. The rest of bits is pointer to head page. * - * All pages have PG_compound set. All tail pages have their ->first_page - * pointing at the head page. + * The first tail page's ->compound_dtor holds the offset in array of compound + * page destructors. See compound_page_dtors. * - * The first tail page's ->lru.next holds the address of the compound page's - * put_page() function. Its ->lru.prev holds the order of allocation. + * The first tail page's ->compound_order holds the order of allocation. * This usage means that zero-order pages may not be compound. */ @@ -473,10 +473,7 @@ void prep_compound_page(struct page *page, unsigned long order) for (i = 1; i < nr_pages; i++) { struct page *p = page + i; set_page_count(p, 0); - p->first_page = page; - /* Make sure p->first_page is always valid for PageTail() */ - smp_wmb(); - __SetPageTail(p); + set_compound_head(p, page); } } @@ -854,17 +851,30 @@ static void free_one_page(struct zone *zone, static int free_tail_pages_check(struct page *head_page, struct page *page) { - if (!IS_ENABLED(CONFIG_DEBUG_VM)) - return 0; + int ret = 1; + + /* + * We rely page->lru.next never has bit 0 set, unless the page + * is PageTail(). Let's make sure that's true even for poisoned ->lru. + */ + BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); + + if (!IS_ENABLED(CONFIG_DEBUG_VM)) { + ret = 0; + goto out; + } if (unlikely(!PageTail(page))) { bad_page(page, "PageTail not set", 0); - return 1; + goto out; } - if (unlikely(page->first_page != head_page)) { - bad_page(page, "first_page not consistent", 0); - return 1; + if (unlikely(compound_head(page) != head_page)) { + bad_page(page, "compound_head not consistent", 0); + goto out; } - return 0; + ret = 0; +out: + clear_compound_head(page); + return ret; } static void __meminit __init_single_page(struct page *page, unsigned long pfn, @@ -931,6 +941,10 @@ void __meminit reserve_bootmem_region(unsigned long start, unsigned long end) struct page *page = pfn_to_page(start_pfn); init_reserved_page(start_pfn); + + /* Avoid false-positive PageTail() */ + INIT_LIST_HEAD(&page->lru); + SetPageReserved(page); } } diff --git a/mm/swap.c b/mm/swap.c index 983f692a47fd..39395fb549c0 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -201,7 +201,7 @@ out_put_single: __put_single_page(page); return; } - VM_BUG_ON_PAGE(page_head != page->first_page, page); + VM_BUG_ON_PAGE(page_head != compound_head(page), page); /* * We can release the refcount taken by * get_page_unless_zero() now that @@ -262,7 +262,7 @@ static void put_compound_page(struct page *page) * Case 3 is possible, as we may race with * __split_huge_page_refcount tearing down a THP page. */ - page_head = compound_head_by_tail(page); + page_head = compound_head(page); if (!__compound_tail_refcounted(page_head)) put_unrefcounted_compound_page(page_head, page); else -- cgit v1.2.3