From 8edf344c66a3f214d709dad1421c29d678915b3f Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 28 May 2010 09:29:15 +0900 Subject: hugetlb: move definition of is_vm_hugetlb_page() to hugepage_inline.h is_vm_hugetlb_page() is a widely used inline function to insert hooks into hugetlb code. But we can't use it in pagemap.h because of circular dependency of the header files. This patch removes this limitation. Acked-by: Mel Gorman Acked-by: Fengguang Wu Signed-off-by: Naoya Horiguchi Signed-off-by: Andi Kleen --- include/linux/hugetlb.h | 11 +---------- include/linux/hugetlb_inline.h | 22 ++++++++++++++++++++++ include/linux/pagemap.h | 1 + 3 files changed, 24 insertions(+), 10 deletions(-) create mode 100644 include/linux/hugetlb_inline.h (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 78b4bc64c006..d47a7c41745d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -2,6 +2,7 @@ #define _LINUX_HUGETLB_H #include +#include struct ctl_table; struct user_struct; @@ -14,11 +15,6 @@ struct user_struct; int PageHuge(struct page *page); -static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) -{ - return vma->vm_flags & VM_HUGETLB; -} - void reset_vma_resv_huge_pages(struct vm_area_struct *vma); int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); @@ -77,11 +73,6 @@ static inline int PageHuge(struct page *page) return 0; } -static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) -{ - return 0; -} - static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) { } diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h new file mode 100644 index 000000000000..cf00b6df53dc --- /dev/null +++ b/include/linux/hugetlb_inline.h @@ -0,0 +1,22 @@ +#ifndef _LINUX_HUGETLB_INLINE_H +#define _LINUX_HUGETLB_INLINE_H 1 + +#ifdef CONFIG_HUGETLBFS + 
+#include + +static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) +{ + return vma->vm_flags & VM_HUGETLB; +} + +#else + +static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) +{ + return 0; +} + +#endif + +#endif diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 3c62ed408492..b2bd2bae9775 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -13,6 +13,7 @@ #include #include #include /* for in_interrupt() */ +#include /* * Bits in mapping->flags. The lower __GFP_BITS_SHIFT bits are the page -- cgit v1.2.3 From 0fe6e20b9c4c53b3e97096ee73a0857f60aad43f Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 28 May 2010 09:29:16 +0900 Subject: hugetlb, rmap: add reverse mapping for hugepage This patch adds reverse mapping feature for hugepage by introducing mapcount for shared/private-mapped hugepage and anon_vma for private-mapped hugepage. While hugepage is not currently swappable, reverse mapping can be useful for memory error handler. Without this patch, memory error handler cannot identify processes using the bad hugepage nor unmap it from them. That is: - for shared hugepage: we can collect processes using a hugepage through pagecache, but can not unmap the hugepage because of the lack of mapcount. - for privately mapped hugepage: we can neither collect processes nor unmap the hugepage. This patch solves these problems. This patch includes the bug fix given by commit 23be7468e8, so it reverts that commit. Dependency: "hugetlb: move definition of is_vm_hugetlb_page() to hugepage_inline.h" ChangeLog since May 24. - create hugetlb_inline.h and move is_vm_hugetlb_page() into it. - move functions setting up anon_vma for hugepage into mm/rmap.c. ChangeLog since May 13. 
- rebased to 2.6.34 - fix logic error (in case that private mapping and shared mapping coexist) - move is_vm_hugetlb_page() into include/linux/mm.h to use this function from linear_page_index() - define and use linear_hugepage_index() instead of compound_order() - use page_move_anon_rmap() in hugetlb_cow() - copy exclusive switch of __set_page_anon_rmap() into hugepage counterpart. - revert commit 23be7468 completely Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Cc: Andrew Morton Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Larry Woodman Cc: Lee Schermerhorn Acked-by: Fengguang Wu Acked-by: Mel Gorman Signed-off-by: Andi Kleen --- include/linux/hugetlb.h | 1 + include/linux/pagemap.h | 8 ++++++- include/linux/poison.h | 9 -------- include/linux/rmap.h | 5 +++++ mm/hugetlb.c | 44 ++++++++++++++++++++++++++++++++++-- mm/rmap.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 114 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d47a7c41745d..e688fd89354d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -99,6 +99,7 @@ static inline void hugetlb_report_meminfo(struct seq_file *m) #define is_hugepage_only_range(mm, addr, len) 0 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) +#define huge_pte_offset(mm, address) 0 #define hugetlb_change_protection(vma, address, end, newprot) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index b2bd2bae9775..a547d9689170 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -282,10 +282,16 @@ static inline loff_t page_offset(struct page *page) return ((loff_t)page->index) << PAGE_CACHE_SHIFT; } +extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma, + unsigned long address); + static inline pgoff_t linear_page_index(struct vm_area_struct *vma, unsigned long address) { - pgoff_t pgoff = (address - 
vma->vm_start) >> PAGE_SHIFT; + pgoff_t pgoff; + if (unlikely(is_vm_hugetlb_page(vma))) + return linear_hugepage_index(vma, address); + pgoff = (address - vma->vm_start) >> PAGE_SHIFT; pgoff += vma->vm_pgoff; return pgoff >> (PAGE_CACHE_SHIFT - PAGE_SHIFT); } diff --git a/include/linux/poison.h b/include/linux/poison.h index 34066ffd893d..2110a81c5e2a 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -48,15 +48,6 @@ #define POISON_FREE 0x6b /* for use-after-free poisoning */ #define POISON_END 0xa5 /* end-byte of poisoning */ -/********** mm/hugetlb.c **********/ -/* - * Private mappings of hugetlb pages use this poisoned value for - * page->mapping. The core VM should not be doing anything with this mapping - * but futex requires the existence of some page->mapping value even though it - * is unused if PAGE_MAPPING_ANON is set. - */ -#define HUGETLB_POISON ((void *)(0x00300300 + POISON_POINTER_DELTA + PAGE_MAPPING_ANON)) - /********** arch/$ARCH/mm/init.c **********/ #define POISON_FREE_INITMEM 0xcc diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 77216742c178..9d50e7ef5f5a 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -140,6 +140,11 @@ void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned lon void page_add_file_rmap(struct page *); void page_remove_rmap(struct page *); +void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, + unsigned long); +void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, + unsigned long); + static inline void page_dup_rmap(struct page *page) { atomic_inc(&page->_mapcount); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 54d42b009dbe..aa3c51739378 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -220,6 +221,12 @@ static pgoff_t vma_hugecache_offset(struct hstate *h, (vma->vm_pgoff >> huge_page_order(h)); } +pgoff_t linear_hugepage_index(struct vm_area_struct *vma, + 
unsigned long address) +{ + return vma_hugecache_offset(hstate_vma(vma), vma, address); +} + /* * Return the size of the pages allocated when backing a VMA. In the majority * cases this will be same size as used by the page table entries. @@ -552,6 +559,7 @@ static void free_huge_page(struct page *page) set_page_private(page, 0); page->mapping = NULL; BUG_ON(page_count(page)); + BUG_ON(page_mapcount(page)); INIT_LIST_HEAD(&page->lru); spin_lock(&hugetlb_lock); @@ -2129,6 +2137,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, entry = huge_ptep_get(src_pte); ptepage = pte_page(entry); get_page(ptepage); + page_dup_rmap(ptepage); set_huge_pte_at(dst, addr, dst_pte, entry); } spin_unlock(&src->page_table_lock); @@ -2207,6 +2216,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, flush_tlb_range(vma, start, end); mmu_notifier_invalidate_range_end(mm, start, end); list_for_each_entry_safe(page, tmp, &page_list, lru) { + page_remove_rmap(page); list_del(&page->lru); put_page(page); } @@ -2272,6 +2282,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, return 1; } +/* + * Hugetlb_cow() should be called with page lock of the original hugepage held. 
+ */ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte, struct page *pagecache_page) @@ -2286,8 +2299,11 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, retry_avoidcopy: /* If no-one else is actually using this page, avoid the copy * and just make the page writable */ - avoidcopy = (page_count(old_page) == 1); + avoidcopy = (page_mapcount(old_page) == 1); if (avoidcopy) { + if (!trylock_page(old_page)) + if (PageAnon(old_page)) + page_move_anon_rmap(old_page, vma, address); set_huge_ptep_writable(vma, address, ptep); return 0; } @@ -2338,6 +2354,13 @@ retry_avoidcopy: return -PTR_ERR(new_page); } + /* + * When the original hugepage is shared one, it does not have + * anon_vma prepared. + */ + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + copy_huge_page(new_page, old_page, address, vma); __SetPageUptodate(new_page); @@ -2352,6 +2375,8 @@ retry_avoidcopy: huge_ptep_clear_flush(vma, address, ptep); set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, new_page, 1)); + page_remove_rmap(old_page); + hugepage_add_anon_rmap(new_page, vma, address); /* Make the old page be freed below */ new_page = old_page; } @@ -2452,10 +2477,17 @@ retry: spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); spin_unlock(&inode->i_lock); + page_dup_rmap(page); } else { lock_page(page); - page->mapping = HUGETLB_POISON; + if (unlikely(anon_vma_prepare(vma))) { + ret = VM_FAULT_OOM; + goto backout_unlocked; + } + hugepage_add_new_anon_rmap(page, vma, address); } + } else { + page_dup_rmap(page); } /* @@ -2507,6 +2539,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *ptep; pte_t entry; int ret; + struct page *page = NULL; struct page *pagecache_page = NULL; static DEFINE_MUTEX(hugetlb_instantiation_mutex); struct hstate *h = hstate_vma(vma); @@ -2548,6 +2581,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 
vma, address); } + if (!pagecache_page) { + page = pte_page(entry); + lock_page(page); + } + spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) @@ -2573,6 +2611,8 @@ out_page_table_lock: if (pagecache_page) { unlock_page(pagecache_page); put_page(pagecache_page); + } else { + unlock_page(page); } out_mutex: diff --git a/mm/rmap.c b/mm/rmap.c index 38a336e2eea1..0ad53572eaf2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -56,6 +56,7 @@ #include #include #include +#include #include @@ -326,6 +327,8 @@ vma_address(struct page *page, struct vm_area_struct *vma) pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); unsigned long address; + if (unlikely(is_vm_hugetlb_page(vma))) + pgoff = page->index << huge_page_order(page_hstate(page)); address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { /* page should be within @vma mapping range */ @@ -369,6 +372,12 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, pte_t *pte; spinlock_t *ptl; + if (unlikely(PageHuge(page))) { + pte = huge_pte_offset(mm, address); + ptl = &mm->page_table_lock; + goto check; + } + pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) return NULL; @@ -389,6 +398,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, } ptl = pte_lockptr(mm, pmd); +check: spin_lock(ptl); if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { *ptlp = ptl; @@ -873,6 +883,12 @@ void page_remove_rmap(struct page *page) page_clear_dirty(page); set_page_dirty(page); } + /* + * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED + * and not charged by memcg for now. 
+ */ + if (unlikely(PageHuge(page))) + return; if (PageAnon(page)) { mem_cgroup_uncharge_page(page); __dec_zone_page_state(page, NR_ANON_PAGES); @@ -1445,3 +1461,46 @@ int rmap_walk(struct page *page, int (*rmap_one)(struct page *, return rmap_walk_file(page, rmap_one, arg); } #endif /* CONFIG_MIGRATION */ + +#ifdef CONFIG_HUGETLBFS +/* + * The following three functions are for anonymous (private mapped) hugepages. + * Unlike common anonymous pages, anonymous hugepages have no accounting code + * and no lru code, because we handle hugepages differently from common pages. + */ +static void __hugepage_set_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address, int exclusive) +{ + struct anon_vma *anon_vma = vma->anon_vma; + BUG_ON(!anon_vma); + if (!exclusive) { + struct anon_vma_chain *avc; + avc = list_entry(vma->anon_vma_chain.prev, + struct anon_vma_chain, same_vma); + anon_vma = avc->anon_vma; + } + anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; + page->mapping = (struct address_space *) anon_vma; + page->index = linear_page_index(vma, address); +} + +void hugepage_add_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + struct anon_vma *anon_vma = vma->anon_vma; + int first; + BUG_ON(!anon_vma); + BUG_ON(address < vma->vm_start || address >= vma->vm_end); + first = atomic_inc_and_test(&page->_mapcount); + if (first) + __hugepage_set_anon_rmap(page, vma, address, 0); +} + +void hugepage_add_new_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + BUG_ON(address < vma->vm_start || address >= vma->vm_end); + atomic_set(&page->_mapcount, 0); + __hugepage_set_anon_rmap(page, vma, address, 1); +} +#endif /* CONFIG_HUGETLBFS */ -- cgit v1.2.3 From 93f70f900da36fbc19c13c2aa04b2e468c8d00fb Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 28 May 2010 09:29:20 +0900 Subject: HWPOISON, hugetlb: isolate corrupted hugepage If error hugepage is not in-use, we can fully 
recover from the error by dequeuing it from the freelist, so return RECOVERED. Otherwise, whether or not we can recover depends on user processes, so return DELAYED. Dependency: "HWPOISON, hugetlb: enable error handling path for hugepage" Signed-off-by: Naoya Horiguchi Cc: Andrew Morton Acked-by: Fengguang Wu Signed-off-by: Andi Kleen --- include/linux/hugetlb.h | 2 ++ mm/hugetlb.c | 16 ++++++++++++++++ mm/memory-failure.c | 28 ++++++++++++++++++++-------- 3 files changed, 38 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e688fd89354d..f479700df61b 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -43,6 +43,7 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, int acctflags); void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); +void __isolate_hwpoisoned_huge_page(struct page *page); extern unsigned long hugepages_treat_as_movable; extern const unsigned long hugetlb_zero, hugetlb_infinity; @@ -100,6 +101,7 @@ static inline void hugetlb_report_meminfo(struct seq_file *m) #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) #define huge_pte_offset(mm, address) 0 +#define __isolate_hwpoisoned_huge_page(page) 0 #define hugetlb_change_protection(vma, address, end, newprot) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index aa3c51739378..8c163f64cf10 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2825,3 +2825,19 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) hugetlb_put_quota(inode->i_mapping, (chg - freed)); hugetlb_acct_memory(h, -(chg - freed)); } + +/* + * This function is called from memory failure code. + * Assume the caller holds page lock of the head page. 
+ */ +void __isolate_hwpoisoned_huge_page(struct page *hpage) +{ + struct hstate *h = page_hstate(hpage); + int nid = page_to_nid(hpage); + + spin_lock(&hugetlb_lock); + list_del(&hpage->lru); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + spin_unlock(&hugetlb_lock); +} diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 473f15a3356d..d0b420aba726 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -690,17 +690,29 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn) /* * Huge pages. Needs work. * Issues: - * No rmap support so we cannot find the original mapper. In theory could walk - * all MMs and look for the mappings, but that would be non atomic and racy. - * Need rmap for hugepages for this. Alternatively we could employ a heuristic, - * like just walking the current process and hoping it has it mapped (that - * should be usually true for the common "shared database cache" case) - * Should handle free huge pages and dequeue them too, but this needs to - * handle huge page accounting correctly. + * - Error on hugepage is contained in hugepage unit (not in raw page unit.) + * To narrow down kill region to one page, we need to break up pmd. + * - To support soft-offlining for hugepage, we need to support hugepage + * migration. */ static int me_huge_page(struct page *p, unsigned long pfn) { - return FAILED; + struct page *hpage = compound_head(p); + /* + * We can safely recover from error on free or reserved (i.e. + * not in-use) hugepage by dequeuing it from freelist. + * To check whether a hugepage is in-use or not, we can't use + * page->lru because it can be used in other hugepage operations, + * such as __unmap_hugepage_range() and gather_surplus_pages(). + * So instead we use page_mapping() and PageAnon(). + * We assume that this function is called with page lock held, + * so there is no race between isolation and mapping/unmapping. 
+ */ + if (!(page_mapping(hpage) || PageAnon(hpage))) { + __isolate_hwpoisoned_huge_page(hpage); + return RECOVERED; + } + return DELAYED; } /* -- cgit v1.2.3 From e3390f67a7267daa227380b6f1bbf13c7ddd4aff Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Tue, 15 Jun 2010 13:18:13 +0900 Subject: hwpoison: rename CONFIG CONFIG_HUGETLBFS controls hugetlbfs interface code. OTOH, CONFIG_HUGETLB_PAGE controls hugepage management code. So we should use CONFIG_HUGETLB_PAGE here. Signed-off-by: Naoya Horiguchi Signed-off-by: Andi Kleen --- include/linux/hugetlb_inline.h | 4 ++-- mm/rmap.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h index cf00b6df53dc..6931489a5c14 100644 --- a/include/linux/hugetlb_inline.h +++ b/include/linux/hugetlb_inline.h @@ -1,7 +1,7 @@ #ifndef _LINUX_HUGETLB_INLINE_H -#define _LINUX_HUGETLB_INLINE_H 1 +#define _LINUX_HUGETLB_INLINE_H -#ifdef CONFIG_HUGETLBFS +#ifdef CONFIG_HUGETLB_PAGE #include diff --git a/mm/rmap.c b/mm/rmap.c index 0ad53572eaf2..71bd30a147cf 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1462,7 +1462,7 @@ int rmap_walk(struct page *page, int (*rmap_one)(struct page *, } #endif /* CONFIG_MIGRATION */ -#ifdef CONFIG_HUGETLBFS +#ifdef CONFIG_HUGETLB_PAGE /* * The following three functions are for anonymous (private mapped) hugepages. * Unlike common anonymous pages, anonymous hugepages have no accounting code @@ -1503,4 +1503,4 @@ void hugepage_add_new_anon_rmap(struct page *page, atomic_set(&page->_mapcount, 0); __hugepage_set_anon_rmap(page, vma, address, 1); } -#endif /* CONFIG_HUGETLBFS */ +#endif /* CONFIG_HUGETLB_PAGE */ -- cgit v1.2.3