From 7bbaf27d9c83037b6e60a818e57bdbedf6bc15be Mon Sep 17 00:00:00 2001
From: Huacai Chen
Date: Thu, 5 Apr 2018 16:18:18 -0700
Subject: zboot: fix stack protector in compressed boot phase

Calling __stack_chk_guard_setup() in decompress_kernel() is too late:
stack checking always fails for decompress_kernel() itself. So remove
__stack_chk_guard_setup() and initialize __stack_chk_guard before we
call decompress_kernel().

The original code comes from ARM but is also used by MIPS and SH, so fix
them together. Without this fix, compressed booting on these
architectures fails, because stack checking is enabled by default
(>= 4.16).

Link: http://lkml.kernel.org/r/1522226933-29317-1-git-send-email-chenhc@lemote.com
Fixes: 8779657d29c0 ("stackprotector: Introduce CONFIG_CC_STACKPROTECTOR_STRONG")
Signed-off-by: Huacai Chen
Acked-by: James Hogan
Acked-by: Kees Cook
Acked-by: Rich Felker
Cc: Ralf Baechle
Cc: Russell King
Cc: Yoshinori Sato
Cc: Ingo Molnar
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/arm/boot/compressed/misc.c | 9 +--------
 arch/mips/boot/compressed/decompress.c | 9 +--------
 arch/sh/boot/compressed/misc.c | 9 +--------
 3 files changed, 3 insertions(+), 24 deletions(-)

diff --git a/arch/arm/boot/compressed/misc.c b/arch/arm/boot/compressed/misc.c
index 16a8a804e958..e8fe51f4e97a 100644
--- a/arch/arm/boot/compressed/misc.c
+++ b/arch/arm/boot/compressed/misc.c
@@ -128,12 +128,7 @@ asmlinkage void __div0(void)
 	error("Attempting division by 0!");
 }
 
-unsigned long __stack_chk_guard;
-
-void __stack_chk_guard_setup(void)
-{
-	__stack_chk_guard = 0x000a0dff;
-}
+const unsigned long __stack_chk_guard = 0x000a0dff;
 
 void __stack_chk_fail(void)
 {
@@ -150,8 +145,6 @@ decompress_kernel(unsigned long output_start, unsigned long free_mem_ptr_p,
 {
 	int ret;
 
-	__stack_chk_guard_setup();
-
 	output_data = (unsigned char *)output_start;
 	free_mem_ptr = free_mem_ptr_p;
 	free_mem_end_ptr = free_mem_ptr_end_p;
diff --git a/arch/mips/boot/compressed/decompress.c b/arch/mips/boot/compressed/decompress.c
index fdf99e9dd4c3..81df9047e110 100644
--- a/arch/mips/boot/compressed/decompress.c
+++ b/arch/mips/boot/compressed/decompress.c
@@ -76,12 +76,7 @@ void error(char *x)
 #include "../../../../lib/decompress_unxz.c"
 #endif
 
-unsigned long __stack_chk_guard;
-
-void __stack_chk_guard_setup(void)
-{
-	__stack_chk_guard = 0x000a0dff;
-}
+const unsigned long __stack_chk_guard = 0x000a0dff;
 
 void __stack_chk_fail(void)
 {
@@ -92,8 +87,6 @@ void decompress_kernel(unsigned long boot_heap_start)
 {
 	unsigned long zimage_start, zimage_size;
 
-	__stack_chk_guard_setup();
-
 	zimage_start = (unsigned long)(&__image_begin);
 	zimage_size = (unsigned long)(&__image_end) -
 		(unsigned long)(&__image_begin);
diff --git a/arch/sh/boot/compressed/misc.c b/arch/sh/boot/compressed/misc.c
index 627ce8e75e01..c15cac9251b9 100644
--- a/arch/sh/boot/compressed/misc.c
+++ b/arch/sh/boot/compressed/misc.c
@@ -104,12 +104,7 @@ static void error(char *x)
 	while(1);	/* Halt */
 }
 
-unsigned long __stack_chk_guard;
-
-void __stack_chk_guard_setup(void)
-{
-	__stack_chk_guard = 0x000a0dff;
-}
+const unsigned long __stack_chk_guard = 0x000a0dff;
 
 void __stack_chk_fail(void)
 {
@@ -130,8 +125,6 @@ void decompress_kernel(void)
 {
 	unsigned long output_addr;
 
-	__stack_chk_guard_setup();
-
 #ifdef CONFIG_SUPERH64
 	output_addr = (CONFIG_MEMORY_START + 0x2000);
 #else
-- cgit v1.2.3

From 5df63c2a149ae65a9ec239e7c2af44efa6f79beb Mon Sep 17 00:00:00 2001
From: Mike Kravetz
Date: Thu, 5 Apr 2018 16:18:21 -0700
Subject: hugetlbfs: fix bug in pgoff overflow checking
This is a fix for a regression in 32 bit kernels caused by an invalid
check for pgoff overflow in hugetlbfs mmap setup. The check incorrectly
specified that the size of a loff_t was the same as the size of a long.
The regression prevents mapping hugetlbfs files at offsets greater than
4GB on 32 bit kernels.

On 32 bit kernels, conversion from a page based unsigned long cannot
overflow a loff_t byte offset. Therefore, skip this check if
sizeof(unsigned long) != sizeof(loff_t).

Link: http://lkml.kernel.org/r/20180330145402.5053-1-mike.kravetz@oracle.com
Fixes: 63489f8e8211 ("hugetlbfs: check for pgoff value overflow")
Reported-by: Dan Rue
Signed-off-by: Mike Kravetz
Tested-by: Anders Roxell
Cc: Michal Hocko
Cc: Yisheng Xie
Cc: "Kirill A . Shutemov"
Cc: Nic Losby
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/hugetlbfs/inode.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b9a254dcc0e7..d508c7844681 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -138,10 +138,14 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 
 	/*
 	 * page based offset in vm_pgoff could be sufficiently large to
-	 * overflow a (l)off_t when converted to byte offset.
+	 * overflow a loff_t when converted to byte offset. This can
+	 * only happen on architectures where sizeof(loff_t) ==
+	 * sizeof(unsigned long). So, only check in those instances.
 	 */
-	if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
-		return -EINVAL;
+	if (sizeof(unsigned long) == sizeof(loff_t)) {
+		if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
+			return -EINVAL;
+	}
 
 	/* must be huge page aligned */
 	if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
-- cgit v1.2.3

From 8351760ff5b2042039554b4948ddabaac644a976 Mon Sep 17 00:00:00 2001
From: Yury Norov
Date: Thu, 5 Apr 2018 16:18:25 -0700
Subject: lib: fix stall in __bitmap_parselist()

syzbot is catching stalls at __bitmap_parselist()
(https://syzkaller.appspot.com/bug?id=ad7e0351fbc90535558514a71cd3edc11681997a).
The trigger is

	unsigned long v = 0;
	bitmap_parselist("7:,", &v, BITS_PER_LONG);

which results in hitting an infinite loop at

	while (a <= b) {
		off = min(b - a + 1, used_size);
		bitmap_set(maskp, a, off);
		a += group_size;
	}

due to used_size == group_size == 0.
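A minimal sketch of the fixed behaviour (the caller setup is illustrative
only; the prototype is the one from lib/bitmap.c):

	unsigned long v = 0;
	int err = bitmap_parselist("7:,", &v, BITS_PER_LONG);
	/* with the group_size == 0 check below, err is now -EINVAL
	 * instead of the call spinning forever in the fill loop */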
Link: http://lkml.kernel.org/r/20180404162647.15763-1-ynorov@caviumnetworks.com
Fixes: 0a5ce0831d04382a ("lib/bitmap.c: make bitmap_parselist() thread-safe and much faster")
Signed-off-by: Yury Norov
Reported-by: Tetsuo Handa
Reported-by: syzbot
Cc: Noam Camus
Cc: Rasmus Villemoes
Cc: Matthew Wilcox
Cc: Mauro Carvalho Chehab
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 lib/bitmap.c | 2 +-
 lib/test_bitmap.c | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/lib/bitmap.c b/lib/bitmap.c
index 9e498c77ed0e..a42eff7e8c48 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -607,7 +607,7 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
 		/* if no digit is after '-', it's wrong*/
 		if (at_start && in_range)
 			return -EINVAL;
-		if (!(a <= b) || !(used_size <= group_size))
+		if (!(a <= b) || group_size == 0 || !(used_size <= group_size))
 			return -EINVAL;
 		if (b >= nmaskbits)
 			return -ERANGE;
diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c
index b3f235baa05d..413367cf569e 100644
--- a/lib/test_bitmap.c
+++ b/lib/test_bitmap.c
@@ -255,6 +255,10 @@ static const struct test_bitmap_parselist parselist_tests[] __initconst = {
 	{-EINVAL, "-1", NULL, 8, 0},
 	{-EINVAL, "-0", NULL, 8, 0},
 	{-EINVAL, "10-1", NULL, 8, 0},
+	{-EINVAL, "0-31:", NULL, 8, 0},
+	{-EINVAL, "0-31:0", NULL, 8, 0},
+	{-EINVAL, "0-31:0/0", NULL, 8, 0},
+	{-EINVAL, "0-31:1/0", NULL, 8, 0},
 	{-EINVAL, "0-31:10/1", NULL, 8, 0},
 };
 
-- cgit v1.2.3

From 6870c0165feaa5e337e78ab2c781ed46f086bca2 Mon Sep 17 00:00:00 2001
From: Changbin Du
Date: Thu, 5 Apr 2018 16:18:29 -0700
Subject: scripts/faddr2line: show the code context

Inspired by the gdb command 'list', show the code context of the target
lines. Here is an example:

$ scripts/faddr2line vmlinux native_write_msr+0x6
native_write_msr+0x6/0x20:
arch_static_branch at arch/x86/include/asm/msr.h:105
 100          return EAX_EDX_VAL(val, low, high);
 101  }
 102
 103  static inline void notrace __wrmsr(unsigned int msr, u32 low, u32 high)
 104  {
 105          asm volatile("1: wrmsr\n"
 106                       "2:\n"
 107                       _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe)
 108                       : : "c" (msr), "a"(low), "d" (high) : "memory");
 109  }
 110
(inlined by) static_key_false at include/linux/jump_label.h:142
 137  #define JUMP_TYPE_LINKED       2UL
 138  #define JUMP_TYPE_MASK         3UL
 139
 140  static __always_inline bool static_key_false(struct static_key *key)
 141  {
 142          return arch_static_branch(key, false);
 143  }
 144
 145  static __always_inline bool static_key_true(struct static_key *key)
 146  {
 147          return !arch_static_branch(key, true);
(inlined by) native_write_msr at arch/x86/include/asm/msr.h:150
 145  static inline void notrace
 146  native_write_msr(unsigned int msr, u32 low, u32 high)
 147  {
 148          __wrmsr(msr, low, high);
 149
 150          if (msr_tracepoint_active(__tracepoint_write_msr))
 151                  do_trace_write_msr(msr, ((u64)high << 32 | low), 0);
 152  }
 153
 154  /* Can be uninlined because referenced by paravirt */
 155  static inline int notrace

Link: http://lkml.kernel.org/r/1521444205-2259-1-git-send-email-changbin.du@intel.com
Signed-off-by: Changbin Du
Cc: Thomas Gleixner
Cc: Philippe Ombredanne
Cc: NeilBrown
Cc: Richard Weinberger
Cc: Kate Stewart
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 scripts/faddr2line | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/scripts/faddr2line b/scripts/faddr2line
index 7721d5b2b0c0..9e5735a4d3a5 100755
--- a/scripts/faddr2line
+++ b/scripts/faddr2line
@@ -163,7 +163,17 @@ __faddr2line() {
 
 		# pass real address to addr2line
 		echo "$func+$offset/$sym_size:"
-
${ADDR2LINE} -fpie $objfile $addr | sed "s; $dir_prefix\(\./\)*; ;" + local file_lines=$(${ADDR2LINE} -fpie $objfile $addr | sed "s; $dir_prefix\(\./\)*; ;") + [[ -z $file_lines ]] && return + + # show each line with context + echo "$file_lines" | while read -r line + do + echo $line + eval $(echo $line | awk -F "[ :]" '{printf("n1=%d;n2=%d;f=%s",$NF-5, $NF+5, $(NF-1))}') + awk 'NR>=strtonum("'$n1'") && NR<=strtonum("'$n2'") {printf("%d\t%s\n", NR, $0)}' $f + done + DONE=1 done < <(${NM} -n $objfile | awk -v fn=$func -v end=$file_end '$3 == fn { found=1; line=$0; start=$1; next } found == 1 { found=0; print line, "0x"$1 } END {if (found == 1) print line, end; }') -- cgit v1.2.3 From 1119d3c06f64a7123d774c363440987952c522ef Mon Sep 17 00:00:00 2001 From: piaojun Date: Thu, 5 Apr 2018 16:18:33 -0700 Subject: ocfs2: use 'osb' instead of 'OCFS2_SB()' We could use 'osb' instead of 'OCFS2_SB()' to make code more elegant. Link: http://lkml.kernel.org/r/5A702111.7090907@huawei.com Signed-off-by: Jun Piao Reviewed-by: Yiwen Jiang Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 2 +- fs/ocfs2/dir.c | 2 +- fs/ocfs2/dlmglue.c | 21 ++++++++------------- fs/ocfs2/file.c | 2 +- fs/ocfs2/inode.c | 6 +++--- fs/ocfs2/refcounttree.c | 4 ++-- fs/ocfs2/suballoc.c | 4 ++-- fs/ocfs2/super.c | 4 ++-- fs/ocfs2/xattr.c | 2 +- 9 files changed, 21 insertions(+), 26 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index e8e205bf2e41..5ff960b7f078 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2213,7 +2213,7 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock, down_write(&oi->ip_alloc_sem); if (first_get_block) { - if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + if (ocfs2_sparse_alloc(osb)) ret = ocfs2_zero_tail(inode, di_bh, pos); else ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 977763d4c27d..b048d4fa3959 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -3072,7 +3072,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, * We need to return the correct block within the * cluster which should hold our entry. */ - off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), + off = ocfs2_dx_dir_hash_idx(osb, &lookup->dl_hinfo); get_bh(dx_leaves[off]); lookup->dl_dx_leaf_bh = dx_leaves[off]; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 9479f99c2145..bc746e821d53 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1756,8 +1756,7 @@ int ocfs2_rw_lock(struct inode *inode, int write) level = write ? DLM_LOCK_EX : DLM_LOCK_PR; - status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0, - 0); + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); if (status < 0) mlog_errno(status); @@ -1796,7 +1795,7 @@ void ocfs2_rw_unlock(struct inode *inode, int write) write ? 
"EXMODE" : "PRMODE"); if (!ocfs2_mount_local(osb)) - ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); + ocfs2_cluster_unlock(osb, lockres, level); } /* @@ -1816,8 +1815,7 @@ int ocfs2_open_lock(struct inode *inode) lockres = &OCFS2_I(inode)->ip_open_lockres; - status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, - DLM_LOCK_PR, 0, 0); + status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_PR, 0, 0); if (status < 0) mlog_errno(status); @@ -1854,8 +1852,7 @@ int ocfs2_try_open_lock(struct inode *inode, int write) * other nodes and the -EAGAIN will indicate to the caller that * this inode is still in use. */ - status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, - level, DLM_LKF_NOQUEUE, 0); + status = ocfs2_cluster_lock(osb, lockres, level, DLM_LKF_NOQUEUE, 0); out: return status; @@ -1876,11 +1873,9 @@ void ocfs2_open_unlock(struct inode *inode) goto out; if(lockres->l_ro_holders) - ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, - DLM_LOCK_PR); + ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_PR); if(lockres->l_ex_holders) - ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, - DLM_LOCK_EX); + ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX); out: return; @@ -2601,9 +2596,9 @@ void ocfs2_inode_unlock(struct inode *inode, (unsigned long long)OCFS2_I(inode)->ip_blkno, ex ? "EXMODE" : "PRMODE"); - if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) && + if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) - ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); + ocfs2_cluster_unlock(osb, lockres, level); } /* diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 5d1784a365a3..9d8c3ee05c39 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -296,7 +296,7 @@ int ocfs2_update_inode_atime(struct inode *inode, ocfs2_journal_dirty(handle, bh); out_commit: - ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); + ocfs2_commit_trans(osb, handle); out: return ret; } diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index d51b80edd972..152f65b9c60e 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1135,7 +1135,7 @@ static void ocfs2_clear_inode(struct inode *inode) trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno, inode->i_nlink); - mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, + mlog_bug_on_msg(osb == NULL, "Inode=%lu\n", inode->i_ino); dquot_drop(inode); @@ -1150,7 +1150,7 @@ static void ocfs2_clear_inode(struct inode *inode) ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres); ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres); - ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap, + ocfs2_resv_discard(&osb->osb_la_resmap, &oi->ip_la_data_resv); ocfs2_resv_init_once(&oi->ip_la_data_resv); @@ -1223,7 +1223,7 @@ static void ocfs2_clear_inode(struct inode *inode) * the journal is flushed before journal shutdown. Thus it is safe to * have inodes get cleaned up after journal shutdown. 
*/ - jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal, + jbd2_journal_release_jbd_inode(osb->journal->j_journal, &oi->ip_jinode); } diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index ab156e35ec00..323b162fb401 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3359,7 +3359,7 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context) unsigned int ext_flags; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { + if (!ocfs2_refcount_tree(osb)) { return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", inode->i_ino); } @@ -3707,7 +3707,7 @@ int ocfs2_add_refcount_flag(struct inode *inode, trace_ocfs2_add_refcount_flag(ref_blocks, credits); if (ref_blocks) { - ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), + ret = ocfs2_reserve_new_metadata_blocks(osb, ref_blocks, &meta_ac); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index d8f5f6ce99dc..40be6474e3d1 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -387,7 +387,7 @@ static int ocfs2_block_group_fill(handle_t *handle, memset(bg, 0, sb->s_blocksize); strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); - bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); + bg->bg_generation = cpu_to_le32(osb->fs_generation); bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1, osb->s_feature_incompat)); bg->bg_chain = cpu_to_le16(my_chain); @@ -1521,7 +1521,7 @@ static int ocfs2_cluster_group_search(struct inode *inode, OCFS2_I(inode)->ip_clusters, max_bits); } - ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), + ret = ocfs2_block_group_find_clear_bits(osb, group_bh, bits_wanted, max_bits, res); if (ret) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index ffa4952d432b..e4d54d2805d6 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -423,10 +423,10 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait) ocfs2_schedule_truncate_log_flush(osb, 0); } - if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal, + if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) { if (wait) - jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal, + jbd2_log_wait_commit(osb->journal->j_journal, target); } return 0; diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index c261c1dfd374..3a24ce3deb01 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -3564,7 +3564,7 @@ int ocfs2_xattr_set(struct inode *inode, .not_found = -ENODATA, }; - if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + if (!ocfs2_supports_xattr(osb)) return -EOPNOTSUPP; /* -- cgit v1.2.3 From d324cd4c80b9cbe1c2ac5285f071387c79bea455 Mon Sep 17 00:00:00 2001 From: piaojun Date: Thu, 5 Apr 2018 16:18:37 -0700 Subject: ocfs2: use 'oi' instead of 'OCFS2_I()' We could use 'oi' instead of 'OCFS2_I()' to make code more elegant. 
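The substitution is mechanical; a sketch of the pattern (the functions
touched by this patch already have the local variable):

	struct ocfs2_inode_info *oi = OCFS2_I(inode);	/* computed once */

	/* then reuse oi->ip_blkno etc. instead of repeating OCFS2_I(inode) */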
Link: http://lkml.kernel.org/r/5A7020FE.5050906@huawei.com Signed-off-by: Jun Piao Reviewed-by: Yiwen Jiang Reviewed-by: Alex Chen Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 2 +- fs/ocfs2/aops.c | 2 +- fs/ocfs2/file.c | 6 +++--- fs/ocfs2/inode.c | 2 +- fs/ocfs2/namei.c | 6 +++--- fs/ocfs2/refcounttree.c | 6 +++--- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 9a876bb07cac..0f157bbd3e0f 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7119,7 +7119,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, goto out_commit; did_quota = 1; - data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; + data_ac->ac_resv = &oi->ip_la_data_resv; ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &num); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 5ff960b7f078..302cd7caa4a7 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -346,7 +346,7 @@ static int ocfs2_readpage(struct file *file, struct page *page) unlock = 0; out_alloc: - up_read(&OCFS2_I(inode)->ip_alloc_sem); + up_read(&oi->ip_alloc_sem); out_inode_unlock: ocfs2_inode_unlock(inode, 0); out: diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 9d8c3ee05c39..164fd0940af4 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -101,7 +101,7 @@ static int ocfs2_file_open(struct inode *inode, struct file *file) struct ocfs2_inode_info *oi = OCFS2_I(inode); trace_ocfs2_file_open(inode, file, file->f_path.dentry, - (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)oi->ip_blkno, file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name, mode); @@ -116,7 +116,7 @@ static int ocfs2_file_open(struct inode *inode, struct file *file) /* Check that the inode hasn't been wiped from disk by another * node. If it hasn't then we're safe as long as we hold the * spin lock until our increment of open count. */ - if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { + if (oi->ip_flags & OCFS2_INODE_DELETED) { spin_unlock(&oi->ip_lock); status = -ENOENT; @@ -190,7 +190,7 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, bool needs_barrier = false; trace_ocfs2_sync_file(inode, file, file->f_path.dentry, - OCFS2_I(inode)->ip_blkno, + oi->ip_blkno, file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name, (unsigned long long)datasync); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 152f65b9c60e..ddc3e9470c87 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1160,7 +1160,7 @@ static void ocfs2_clear_inode(struct inode *inode) * exception here are successfully wiped inodes - their * metadata can now be considered to be part of the system * inodes from which it came. */ - if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED)) + if (!(oi->ip_flags & OCFS2_INODE_DELETED)) ocfs2_checkpoint_inode(inode); mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index c801eddc4bf3..8dd6f703c819 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -525,7 +525,7 @@ static int __ocfs2_mknod_locked(struct inode *dir, * these are used by the support functions here and in * callers. 
*/ inode->i_ino = ino_from_blkno(osb->sb, fe_blkno); - OCFS2_I(inode)->ip_blkno = fe_blkno; + oi->ip_blkno = fe_blkno; spin_lock(&osb->osb_lock); inode->i_generation = osb->s_next_generation++; spin_unlock(&osb->osb_lock); @@ -1186,8 +1186,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, } trace_ocfs2_double_lock_end( - (unsigned long long)OCFS2_I(inode1)->ip_blkno, - (unsigned long long)OCFS2_I(inode2)->ip_blkno); + (unsigned long long)oi1->ip_blkno, + (unsigned long long)oi2->ip_blkno); bail: if (status) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 323b162fb401..01c6b3894406 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -573,7 +573,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode, BUG_ON(ocfs2_is_refcount_inode(inode)); trace_ocfs2_create_refcount_tree( - (unsigned long long)OCFS2_I(inode)->ip_blkno); + (unsigned long long)oi->ip_blkno); ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); if (ret) { @@ -4766,8 +4766,8 @@ static int ocfs2_reflink_inodes_lock(struct inode *s_inode, *bh2 = *bh1; trace_ocfs2_double_lock_end( - (unsigned long long)OCFS2_I(inode1)->ip_blkno, - (unsigned long long)OCFS2_I(inode2)->ip_blkno); + (unsigned long long)oi1->ip_blkno, + (unsigned long long)oi2->ip_blkno); return 0; -- cgit v1.2.3 From bb4c9d67650efb02bd8c2b0ea077d421d54cbc4c Mon Sep 17 00:00:00 2001 From: piaojun Date: Thu, 5 Apr 2018 16:18:41 -0700 Subject: ocfs2: remove some unused function declarations Remove some unused function declarations in dlmcommon.h. Link: http://lkml.kernel.org/r/5A7D1034.7050807@huawei.com Signed-off-by: Jun Piao Reviewed-by: Yiwen Jiang Reviewed-by: Changwei Ge Reviewed-by: Alex Chen Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmcommon.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index e9f3705c4c9f..953c200e1c30 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -960,13 +960,10 @@ static inline int dlm_send_proxy_ast(struct dlm_ctxt *dlm, void dlm_print_one_lock_resource(struct dlm_lock_resource *res); void __dlm_print_one_lock_resource(struct dlm_lock_resource *res); -u8 dlm_nm_this_node(struct dlm_ctxt *dlm); void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); -int dlm_nm_init(struct dlm_ctxt *dlm); -int dlm_heartbeat_init(struct dlm_ctxt *dlm); void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data); void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data); -- cgit v1.2.3 From 1202d4ba2899e083417b7b4fb9b544942fd2b9b7 Mon Sep 17 00:00:00 2001 From: Jia Guo Date: Thu, 5 Apr 2018 16:18:45 -0700 Subject: ocfs2: keep the trace point consistent with the function name Keep the trace point consistent with the function name. 
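A sketch of the convention being restored (the macro is the one used in
fs/ocfs2/ocfs2_trace.h):

	/* the trace point is named after the function that fires it: */
	DEFINE_OCFS2_FILE_OPS(ocfs2_file_write_iter);
	/* fired from ocfs2_file_write_iter(), not the old aio_write name */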
Link: http://lkml.kernel.org/r/02609aba-84b2-a22d-3f3b-bc1944b94260@huawei.com Fixes: 3ef045c3d8ae ("ocfs2: switch to ->write_iter()") Signed-off-by: Jia Guo Reviewed-by: Jun Piao Reviewed-by: Yiwen Jiang Reviewed-by: Alex Chen Acked-by: Gang He Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.h | 2 +- fs/ocfs2/file.c | 8 ++++---- fs/ocfs2/ocfs2_trace.h | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 8614ff069d99..3494a62ed749 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -78,7 +78,7 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level) /* * Using a named enum representing lock types in terms of #N bit stored in * iocb->private, which is going to be used for communication between - * ocfs2_dio_end_io() and ocfs2_file_aio_write/read(). + * ocfs2_dio_end_io() and ocfs2_file_write/read_iter(). */ enum ocfs2_iocb_lock_bits { OCFS2_IOCB_RW_LOCK = 0, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 164fd0940af4..6ee94bc23f5b 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2257,7 +2257,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0; - trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, + trace_ocfs2_file_write_iter(inode, file, file->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name, @@ -2405,7 +2405,7 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0; - trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry, + trace_ocfs2_file_read_iter(inode, filp, filp->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, filp->f_path.dentry->d_name.len, filp->f_path.dentry->d_name.name, @@ -2448,7 +2448,7 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, * * Take and drop the meta data lock to update inode fields * like i_size. This allows the checks down below - * generic_file_aio_read() a chance of actually working. + * generic_file_read_iter() a chance of actually working. 
 */
	ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt,
				     &lock_level, !nowait);
@@ -2460,7 +2460,7 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
 	ocfs2_inode_unlock(inode, lock_level);
 
 	ret = generic_file_read_iter(iocb, to);
-	trace_generic_file_aio_read_ret(ret);
+	trace_generic_file_read_iter_ret(ret);
 
 	/* buffered aio wouldn't have proper lock coverage today */
 	BUG_ON(ret == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT));
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index e2a11aaece10..2ee76a90ba8f 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1311,11 +1311,11 @@ DEFINE_OCFS2_FILE_OPS(ocfs2_file_release);
 
 DEFINE_OCFS2_FILE_OPS(ocfs2_sync_file);
 
-DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_write);
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_write_iter);
 
 DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_write);
 
-DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_read);
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_read_iter);
 
 DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_truncate_file);
 
@@ -1467,7 +1467,7 @@ TRACE_EVENT(ocfs2_prepare_inode_for_write,
 		  __entry->saved_pos, __entry->count, __entry->wait)
 );
 
-DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
+DEFINE_OCFS2_INT_EVENT(generic_file_read_iter_ret);
 
 /* End of trace events for fs/ocfs2/file.c. */
 
-- cgit v1.2.3

From bb34f24c7d2c98d0c81838a7700e6068325b17a0 Mon Sep 17 00:00:00 2001
From: Jun Piao
Date: Thu, 5 Apr 2018 16:18:48 -0700
Subject: ocfs2/dlm: don't handle migrate lockres if already in shutdown

We should not handle migrate lockres if we are already in
'DLM_CTXT_IN_SHUTDOWN', as that will cause a lockres to remain after
leaving the dlm domain. Eventually other nodes will get stuck in an
infinite loop when requesting the lock from us.

The problem is caused by a concurrent umount between nodes. Before
receiving N1's DLM_BEGIN_EXIT_DOMAIN_MSG, N2 has picked up N1 as the
migrate target. So N2 will continue sending the lockres to N1 even
though N1 has left the domain.
  N1                             N2 (owner)

                                 touch file

access the file,
and get pr lock

                                 begin leave domain and
                                 pick up N1 as new owner

begin leave domain and
migrate all lockres done

                                 begin migrate lockres to N1

end leave domain, but
the lockres left
unexpectedly, because
migrate task has passed

[piaojun@huawei.com: v3]
  Link: http://lkml.kernel.org/r/5A9CBD19.5020107@huawei.com
Link: http://lkml.kernel.org/r/5A99F028.2090902@huawei.com
Signed-off-by: Jun Piao
Reviewed-by: Yiwen Jiang
Reviewed-by: Joseph Qi
Reviewed-by: Changwei Ge
Cc: Mark Fasheh
Cc: Joel Becker
Cc: Junxiao Bi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/dlm/dlmdomain.c | 14 --------------
 fs/ocfs2/dlm/dlmdomain.h | 25 ++++++++++++++++++++++++-
 fs/ocfs2/dlm/dlmrecovery.c | 9 +++++++++
 3 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index e1fea149f50b..25b76f0d082b 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -675,20 +675,6 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm)
 	spin_unlock(&dlm->spinlock);
 }
 
-int dlm_shutting_down(struct dlm_ctxt *dlm)
-{
-	int ret = 0;
-
-	spin_lock(&dlm_domain_lock);
-
-	if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN)
-		ret = 1;
-
-	spin_unlock(&dlm_domain_lock);
-
-	return ret;
-}
-
 void dlm_unregister_domain(struct dlm_ctxt *dlm)
 {
 	int leave = 0;
diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h
index fd6122a38dbd..8a9281411c18 100644
--- a/fs/ocfs2/dlm/dlmdomain.h
+++ b/fs/ocfs2/dlm/dlmdomain.h
@@ -28,7 +28,30 @@
 extern spinlock_t dlm_domain_lock;
 extern struct list_head dlm_domains;
 
-int dlm_shutting_down(struct dlm_ctxt *dlm);
+static inline int dlm_joined(struct dlm_ctxt *dlm)
+{
+	int ret = 0;
+
+	spin_lock(&dlm_domain_lock);
+	if (dlm->dlm_state == DLM_CTXT_JOINED)
+		ret = 1;
+	spin_unlock(&dlm_domain_lock);
+
+	return ret;
+}
+
+static inline int dlm_shutting_down(struct dlm_ctxt *dlm)
+{
+	int ret = 0;
+
+	spin_lock(&dlm_domain_lock);
+	if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN)
+		ret = 1;
+	spin_unlock(&dlm_domain_lock);
+
+	return ret;
+}
+
 void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
 					int node_num);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index ec8f75813beb..505ab4281f36 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1378,6 +1378,15 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
 	if (!dlm_grab(dlm))
 		return -EINVAL;
 
+	if (!dlm_joined(dlm)) {
+		mlog(ML_ERROR, "Domain %s not joined! "
+			  "lockres %.*s, master %u\n",
+			  dlm->name, mres->lockname_len,
+			  mres->lockname, mres->master);
+		dlm_put(dlm);
+		return -EINVAL;
+	}
+
 	BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION)));
 
 	real_master = mres->master;
-- cgit v1.2.3

From a17b485aae3effe2f35345dc7af4ed2070a2d949 Mon Sep 17 00:00:00 2001
From: piaojun
Date: Thu, 5 Apr 2018 16:18:52 -0700
Subject: ocfs2: remove unnecessary null pointer check before kmem_cache_destroy()

Since kmem_cache_destroy() already handles null pointers, we can remove
the conditional test entirely.
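For reference, kmem_cache_destroy() is a no-op for a NULL pointer, so the
call sites shrink to (the pattern applied throughout the hunks below):

	kmem_cache_destroy(dlm_lockname_cache);	/* safe even when NULL */
	dlm_lockname_cache = NULL;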
Link: http://lkml.kernel.org/r/5A9EB21D.3000209@huawei.com Signed-off-by: Jun Piao Reviewed-by: Yiwen Jiang Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmlock.c | 3 +-- fs/ocfs2/dlm/dlmmaster.c | 15 +++++---------- fs/ocfs2/super.c | 18 ++++++------------ fs/ocfs2/uptodate.c | 3 +-- 4 files changed, 13 insertions(+), 26 deletions(-) diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 66c2a491f68d..74962315794e 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -77,8 +77,7 @@ int dlm_init_lock_cache(void) void dlm_destroy_lock_cache(void) { - if (dlm_lock_cache) - kmem_cache_destroy(dlm_lock_cache); + kmem_cache_destroy(dlm_lock_cache); } /* Tell us whether we can grant a new lock request. diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index a7df226f9449..03766fb41dbe 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -414,8 +414,7 @@ int dlm_init_mle_cache(void) void dlm_destroy_mle_cache(void) { - if (dlm_mle_cache) - kmem_cache_destroy(dlm_mle_cache); + kmem_cache_destroy(dlm_mle_cache); } static void dlm_mle_release(struct kref *kref) @@ -472,15 +471,11 @@ bail: void dlm_destroy_master_caches(void) { - if (dlm_lockname_cache) { - kmem_cache_destroy(dlm_lockname_cache); - dlm_lockname_cache = NULL; - } + kmem_cache_destroy(dlm_lockname_cache); + dlm_lockname_cache = NULL; - if (dlm_lockres_cache) { - kmem_cache_destroy(dlm_lockres_cache); - dlm_lockres_cache = NULL; - } + kmem_cache_destroy(dlm_lockres_cache); + dlm_lockres_cache = NULL; } static void dlm_lockres_release(struct kref *kref) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index e4d54d2805d6..a49b7c2bde9e 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1768,12 +1768,9 @@ static int ocfs2_initialize_mem_caches(void) NULL); if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep || !ocfs2_qf_chunk_cachep) { - if (ocfs2_inode_cachep) - kmem_cache_destroy(ocfs2_inode_cachep); - if (ocfs2_dquot_cachep) - kmem_cache_destroy(ocfs2_dquot_cachep); - if (ocfs2_qf_chunk_cachep) - kmem_cache_destroy(ocfs2_qf_chunk_cachep); + kmem_cache_destroy(ocfs2_inode_cachep); + kmem_cache_destroy(ocfs2_dquot_cachep); + kmem_cache_destroy(ocfs2_qf_chunk_cachep); return -ENOMEM; } @@ -1787,16 +1784,13 @@ static void ocfs2_free_mem_caches(void) * destroy cache. */ rcu_barrier(); - if (ocfs2_inode_cachep) - kmem_cache_destroy(ocfs2_inode_cachep); + kmem_cache_destroy(ocfs2_inode_cachep); ocfs2_inode_cachep = NULL; - if (ocfs2_dquot_cachep) - kmem_cache_destroy(ocfs2_dquot_cachep); + kmem_cache_destroy(ocfs2_dquot_cachep); ocfs2_dquot_cachep = NULL; - if (ocfs2_qf_chunk_cachep) - kmem_cache_destroy(ocfs2_qf_chunk_cachep); + kmem_cache_destroy(ocfs2_qf_chunk_cachep); ocfs2_qf_chunk_cachep = NULL; } diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index 82e17b076ce7..78f09c76ab3c 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c @@ -633,6 +633,5 @@ int __init init_ocfs2_uptodate_cache(void) void exit_ocfs2_uptodate_cache(void) { - if (ocfs2_uptodate_cachep) - kmem_cache_destroy(ocfs2_uptodate_cachep); + kmem_cache_destroy(ocfs2_uptodate_cachep); } -- cgit v1.2.3 From a012ab4d4316e6a6da8d93e96c74c51b59965dc3 Mon Sep 17 00:00:00 2001 From: Changwei Ge Date: Thu, 5 Apr 2018 16:18:56 -0700 Subject: ocfs2: remove two unused functions from suballoc.c The two functions are no longer used. 
Link: http://lkml.kernel.org/r/1519609595-26229-1-git-send-email-ge.changwei@h3c.com Signed-off-by: Changwei Ge Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/suballoc.c | 49 ------------------------------------------------- 1 file changed, 49 deletions(-) diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 40be6474e3d1..f7c972fbed6a 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -79,8 +79,6 @@ static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res) return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset); } -static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); -static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); static int ocfs2_block_group_fill(handle_t *handle, struct inode *alloc_inode, @@ -2626,53 +2624,6 @@ int ocfs2_release_clusters(handle_t *handle, _ocfs2_clear_bit); } -static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg) -{ - printk("Block Group:\n"); - printk("bg_signature: %s\n", bg->bg_signature); - printk("bg_size: %u\n", bg->bg_size); - printk("bg_bits: %u\n", bg->bg_bits); - printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count); - printk("bg_chain: %u\n", bg->bg_chain); - printk("bg_generation: %u\n", le32_to_cpu(bg->bg_generation)); - printk("bg_next_group: %llu\n", - (unsigned long long)bg->bg_next_group); - printk("bg_parent_dinode: %llu\n", - (unsigned long long)bg->bg_parent_dinode); - printk("bg_blkno: %llu\n", - (unsigned long long)bg->bg_blkno); -} - -static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe) -{ - int i; - - printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno); - printk("i_signature: %s\n", fe->i_signature); - printk("i_size: %llu\n", - (unsigned long long)fe->i_size); - printk("i_clusters: %u\n", fe->i_clusters); - printk("i_generation: %u\n", - le32_to_cpu(fe->i_generation)); - printk("id1.bitmap1.i_used: %u\n", - le32_to_cpu(fe->id1.bitmap1.i_used)); - printk("id1.bitmap1.i_total: %u\n", - le32_to_cpu(fe->id1.bitmap1.i_total)); - printk("id2.i_chain.cl_cpg: %u\n", fe->id2.i_chain.cl_cpg); - printk("id2.i_chain.cl_bpc: %u\n", fe->id2.i_chain.cl_bpc); - printk("id2.i_chain.cl_count: %u\n", fe->id2.i_chain.cl_count); - printk("id2.i_chain.cl_next_free_rec: %u\n", - fe->id2.i_chain.cl_next_free_rec); - for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) { - printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i, - fe->id2.i_chain.cl_recs[i].c_free); - printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i, - fe->id2.i_chain.cl_recs[i].c_total); - printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i, - (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno); - } -} - /* * For a given allocation, determine which allocators will need to be * accessed, and lock them, reserving the appropriate number of bits. -- cgit v1.2.3 From 26ec1615ca82e00b583aab92a0d88d9c392932e3 Mon Sep 17 00:00:00 2001 From: Changwei Ge Date: Thu, 5 Apr 2018 16:19:00 -0700 Subject: fs/ocfs2/dlm/dlmrecovery.c: remove unrelated comment Obviously, the comment before dlm_do_local_recovery_cleanup() has nothing to do with it. So remove it. 
Link: http://lkml.kernel.org/r/1519371054-4648-1-git-send-email-ge.changwei@h3c.com Signed-off-by: Changwei Ge Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmrecovery.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 505ab4281f36..ebff63b9becf 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2340,13 +2340,6 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, __dlm_dirty_lockres(dlm, res); } -/* if this node is the recovery master, and there are no - * locks for a given lockres owned by this node that are in - * either PR or EX mode, zero out the lvb before requesting. - * - */ - - static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) { struct dlm_lock_resource *res; -- cgit v1.2.3 From 2bcb654c93a5a9d741d964c3d399b9e8e9157e8c Mon Sep 17 00:00:00 2001 From: Changwei Ge Date: Thu, 5 Apr 2018 16:19:03 -0700 Subject: ocfs2/dlm: clean up unused argument for dlm_destroy_recovery_area() Link: http://lkml.kernel.org/r/1521116681-14602-1-git-send-email-ge.changwei@h3c.com Signed-off-by: Changwei Ge Cc: Mark Fasheh Cc: Joseph Qi Cc: Junxiao Bi Cc: Joel Becker Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmrecovery.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index ebff63b9becf..86204b81ef34 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -62,7 +62,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node); static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node); static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, u8 dead_node); -static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node); +static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm); static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res); static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres, @@ -739,7 +739,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) } if (destroy) - dlm_destroy_recovery_area(dlm, dead_node); + dlm_destroy_recovery_area(dlm); return status; } @@ -764,7 +764,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) ndata = kzalloc(sizeof(*ndata), GFP_NOFS); if (!ndata) { - dlm_destroy_recovery_area(dlm, dead_node); + dlm_destroy_recovery_area(dlm); return -ENOMEM; } ndata->node_num = num; @@ -778,7 +778,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) return 0; } -static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) +static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm) { struct dlm_reco_node_data *ndata, *next; LIST_HEAD(tmplist); -- cgit v1.2.3 From a43d24cb3b0b395de8bb176355a907a9c9a1c42e Mon Sep 17 00:00:00 2001 From: Changwei Ge Date: Thu, 5 Apr 2018 16:19:07 -0700 Subject: ocfs2/dlm: clean up unused stack variable in dlm_do_local_ast() Link: http://lkml.kernel.org/r/1521116681-14602-2-git-send-email-ge.changwei@h3c.com Signed-off-by: Changwei Ge Cc: Mark Fasheh Cc: Joseph Qi Cc: Junxiao Bi Cc: Joel Becker Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmast.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 
fd6bbbbd7d78..39831fc2fd52 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -224,14 +224,12 @@ void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 		      struct dlm_lock *lock)
 {
 	dlm_astlockfunc_t *fn;
-	struct dlm_lockstatus *lksb;
 
 	mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name,
 	     res->lockname.len, res->lockname.name,
 	     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
 	     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
 
-	lksb = lock->lksb;
 	fn = lock->ast;
 
 	BUG_ON(lock->ml.node != dlm->node_num);
-- cgit v1.2.3

From 60c7ec9ee4a3410c2cb08850102d363c7e207f48 Mon Sep 17 00:00:00 2001
From: piaojun
Date: Thu, 5 Apr 2018 16:19:11 -0700
Subject: ocfs2/dlm: wait for dlm recovery done when migrating all lock resources

Wait for dlm recovery to be done when migrating all lock resources, in
case a new lock resource is left behind after leaving the dlm domain.
The leftover lock resource will cause other nodes to BUG.

  NodeA                        NodeB               NodeC

umount:
  dlm_unregister_domain()
    dlm_migrate_all_locks()

                               NodeB down

do recovery for NodeB
and collect a new lockres
from other live nodes:

  dlm_do_recovery
    dlm_remaster_locks
      dlm_request_all_locks:

  dlm_mig_lockres_handler
    dlm_new_lockres
      __dlm_insert_lockres

at last NodeA becomes the
master of the new lockres
and leaves the domain:
  dlm_leave_domain()

                                                   mount:
                                                     dlm_join_domain()

                                                   touch file and request
                                                   for the owner of the new
                                                   lockres, but all the
                                                   other nodes said 'NO',
                                                   so NodeC decides to be
                                                   the owner, and sends an
                                                   assert msg to other
                                                   nodes:

                                                   dlmlock()
                                                     dlm_get_lock_resource()
                                                       dlm_do_assert_master()

                                                   other nodes receive the
                                                   msg and find two masters
                                                   exist, which at last
                                                   causes a BUG in
                                                   dlm_assert_master_handler()
                                                   -->BUG();

Link: http://lkml.kernel.org/r/5AAA6E25.7090303@huawei.com
Fixes: bc9838c4d44a ("dlm: allow dlm do recovery during shutdown")
Signed-off-by: Jun Piao
Reviewed-by: Alex Chen
Reviewed-by: Yiwen Jiang
Acked-by: Joseph Qi
Cc: Mark Fasheh
Cc: Joel Becker
Cc: Junxiao Bi
Cc: Changwei Ge
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/dlm/dlmcommon.h | 1 +
 fs/ocfs2/dlm/dlmdomain.c | 15 +++++++++++++++
 fs/ocfs2/dlm/dlmrecovery.c | 13 ++++++++++---
 3 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 953c200e1c30..d06e27ec4be4 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -140,6 +140,7 @@ struct dlm_ctxt
 	u8 node_num;
 	u32 key;
 	u8 joining_node;
+	u8 migrate_done; /* set to 1 means node has migrated all lock resources */
 	wait_queue_head_t dlm_join_events;
 	unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
 	unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 25b76f0d082b..425081be6161 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -461,6 +461,19 @@ redo_bucket:
 			cond_resched_lock(&dlm->spinlock);
 			num += n;
 		}
+
+	if (!num) {
+		if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
+			mlog(0, "%s: perhaps there are more lock resources "
+			     "need to be migrated after dlm recovery\n", dlm->name);
+			ret = -EAGAIN;
+		} else {
+			mlog(0, "%s: we won't do dlm recovery after migrating "
+			     "all lock resources\n", dlm->name);
+			dlm->migrate_done = 1;
+		}
+	}
+
 	spin_unlock(&dlm->spinlock);
 	wake_up(&dlm->dlm_thread_wq);
 
@@ -2038,6 +2051,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
 	init_waitqueue_head(&dlm->dlm_join_events);
 
+	dlm->migrate_done = 0;
+
 	dlm->reco.new_master =
O2NM_INVALID_NODE_NUM; dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 86204b81ef34..b454eb371b77 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -423,12 +423,11 @@ void dlm_wait_for_recovery(struct dlm_ctxt *dlm) static void dlm_begin_recovery(struct dlm_ctxt *dlm) { - spin_lock(&dlm->spinlock); + assert_spin_locked(&dlm->spinlock); BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE); printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n", dlm->name, dlm->reco.dead_node); dlm->reco.state |= DLM_RECO_STATE_ACTIVE; - spin_unlock(&dlm->spinlock); } static void dlm_end_recovery(struct dlm_ctxt *dlm) @@ -456,6 +455,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) spin_lock(&dlm->spinlock); + if (dlm->migrate_done) { + mlog(0, "%s: no need do recovery after migrating all " + "lock resources\n", dlm->name); + spin_unlock(&dlm->spinlock); + return 0; + } + /* check to see if the new master has died */ if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM && test_bit(dlm->reco.new_master, dlm->recovery_map)) { @@ -490,12 +496,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n", dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.dead_node); - spin_unlock(&dlm->spinlock); /* take write barrier */ /* (stops the list reshuffling thread, proxy ast handling) */ dlm_begin_recovery(dlm); + spin_unlock(&dlm->spinlock); + if (dlm->reco.new_master == dlm->node_num) goto master_here; -- cgit v1.2.3 From 510c48795d4b48963d4d8a889bb8650a11e3f440 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 5 Apr 2018 16:19:14 -0700 Subject: ocfs2: fix spelling mistake: "Migrateable" -> "Migratable" Trivial fix to spelling mistake in mlog message text Link: http://lkml.kernel.org/r/20180319114101.2051-1-colin.king@canonical.com Signed-off-by: Colin Ian King Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmmaster.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 03766fb41dbe..81d500211f43 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2543,7 +2543,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, return 0; } - mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len, + mlog(0, "%s: res %.*s, Migratable\n", dlm->name, res->lockname.len, res->lockname.name); return 1; -- cgit v1.2.3 From baa31b89b00236ea459502bff490aef1a0d29562 Mon Sep 17 00:00:00 2001 From: Changwei Ge Date: Thu, 5 Apr 2018 16:19:18 -0700 Subject: ocfs2: correct spelling mistake for migratable for all Inspired by the ocfs2 patch to fix the spelling of migrateable to migratable, I checked all ocfs2 files and found more spelling mistakes. So correct them all. 
Link: http://lkml.kernel.org/r/1521525734-19576-1-git-send-email-ge.changwei@h3c.com
Signed-off-by: Changwei Ge
Cc: Mark Fasheh
Cc: Joel Becker
Cc: Junxiao Bi
Cc: Joseph Qi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/dlm/dlmmaster.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 81d500211f43..aaca0949fe53 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2490,13 +2490,13 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
 }
 
 /*
- * A migrateable resource is one that is :
+ * A migratable resource is one that is :
 * 1. locally mastered, and,
 * 2. zero local locks, and,
 * 3. one or more non-local locks, or, one or more references
 * Returns 1 if yes, 0 if not.
 */
-static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
+static int dlm_is_lockres_migratable(struct dlm_ctxt *dlm,
 			      struct dlm_lock_resource *res)
 {
 	enum dlm_lockres_list idx;
@@ -2527,7 +2527,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
 			continue;
 		}
 		cookie = be64_to_cpu(lock->ml.cookie);
-		mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on "
+		mlog(0, "%s: Not migratable res %.*s, lock %u:%llu on "
 		     "%s list\n", dlm->name, res->lockname.len,
 		     res->lockname.name,
 		     dlm_get_lock_cookie_node(cookie),
@@ -2787,7 +2787,7 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
 	assert_spin_locked(&dlm->spinlock);
 
 	spin_lock(&res->spinlock);
-	if (dlm_is_lockres_migrateable(dlm, res))
+	if (dlm_is_lockres_migratable(dlm, res))
 		target = dlm_pick_migration_target(dlm, res);
 	spin_unlock(&res->spinlock);
 
-- cgit v1.2.3

From 5ac94386194eccc2125c17d1b75aa8364584f585 Mon Sep 17 00:00:00 2001
From: Gang He
Date: Thu, 5 Apr 2018 16:19:22 -0700
Subject: ocfs2: move some definitions to header file

Patch series "ocfs2: use kobject for online file check", v3.

Use the embedded kobject mechanism for the online file check feature;
this avoids using a global list to save/search per-device online file
check related data. The changed code is based on Goldwyn Rodrigues's
patches and the ext4 fs code; there are no new features added, except
some very small fixes made during this code refactoring. Second, the
code change does not affect the underlying file check code. Thank
Goldwyn very much.

Compared with the second version, more comments were added in the patch
descriptions, to make sure each modification is mentioned.

Compared with the first version, the code change was split into four
patches, making sure each patch will not bring ocfs2 kernel module
compiling errors.

This patch (of 3):

Move some definitions to a header file, where they will be referenced by
other source files when the kobject mechanism is introduced.
Link: http://lkml.kernel.org/r/1495611866-27360-2-git-send-email-ghe@suse.com
Signed-off-by: Gang He
Cc: Mark Fasheh
Cc: Joel Becker
Cc: Junxiao Bi
Cc: Joseph Qi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/filecheck.c | 27 ---------------------------
 fs/ocfs2/filecheck.h | 27 +++++++++++++++++++++++++++
 2 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c
index 6b92cb241138..68f6c35dce36 100644
--- a/fs/ocfs2/filecheck.c
+++ b/fs/ocfs2/filecheck.c
@@ -56,33 +56,6 @@ static const char * const ocfs2_filecheck_errs[] = {
 	"UNSUPPORTED"
 };
 
 static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock);
 static LIST_HEAD(ocfs2_filecheck_sysfs_list);
 
-struct ocfs2_filecheck {
-	struct list_head fc_head;	/* File check entry list head */
-	spinlock_t fc_lock;
-	unsigned int fc_max;	/* Maximum number of entry in list */
-	unsigned int fc_size;	/* Current entry count in list */
-	unsigned int fc_done;	/* Finished entry count in list */
-};
-
-struct ocfs2_filecheck_sysfs_entry {	/* sysfs entry per mounting */
-	struct list_head fs_list;
-	atomic_t fs_count;
-	struct super_block *fs_sb;
-	struct kset *fs_devicekset;
-	struct kset *fs_fcheckkset;
-	struct ocfs2_filecheck *fs_fcheck;
-};
-
-#define OCFS2_FILECHECK_MAXSIZE		100
-#define OCFS2_FILECHECK_MINSIZE		10
-
-/* File check operation type */
-enum {
-	OCFS2_FILECHECK_TYPE_CHK = 0,	/* Check a file(inode) */
-	OCFS2_FILECHECK_TYPE_FIX,	/* Fix a file(inode) */
-	OCFS2_FILECHECK_TYPE_SET = 100	/* Set entry list maximum size */
-};
-
 struct ocfs2_filecheck_entry {
 	struct list_head fe_list;
 	unsigned long fe_ino;
diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h
index e5cd002a2c09..af1678b620a4 100644
--- a/fs/ocfs2/filecheck.h
+++ b/fs/ocfs2/filecheck.h
@@ -43,6 +43,33 @@ enum {
 #define OCFS2_FILECHECK_ERR_START	OCFS2_FILECHECK_ERR_FAILED
 #define OCFS2_FILECHECK_ERR_END		OCFS2_FILECHECK_ERR_UNSUPPORTED
 
+struct ocfs2_filecheck {
+	struct list_head fc_head;	/* File check entry list head */
+	spinlock_t fc_lock;
+	unsigned int fc_max;	/* Maximum number of entry in list */
+	unsigned int fc_size;	/* Current entry count in list */
+	unsigned int fc_done;	/* Finished entry count in list */
+};
+
+struct ocfs2_filecheck_sysfs_entry {	/* sysfs entry per mounting */
+	struct list_head fs_list;
+	atomic_t fs_count;
+	struct super_block *fs_sb;
+	struct kset *fs_devicekset;
+	struct kset *fs_fcheckkset;
+	struct ocfs2_filecheck *fs_fcheck;
+};
+
+#define OCFS2_FILECHECK_MAXSIZE		100
+#define OCFS2_FILECHECK_MINSIZE		10
+
+/* File check operation type */
+enum {
+	OCFS2_FILECHECK_TYPE_CHK = 0,	/* Check a file(inode) */
+	OCFS2_FILECHECK_TYPE_FIX,	/* Fix a file(inode) */
+	OCFS2_FILECHECK_TYPE_SET = 100	/* Set entry list maximum size */
+};
+
 int ocfs2_filecheck_create_sysfs(struct super_block *sb);
 int ocfs2_filecheck_remove_sysfs(struct super_block *sb);
 
-- cgit v1.2.3

From 8fc2cb4ba03ee3502ec23de825cd133f5388738f Mon Sep 17 00:00:00 2001
From: Gang He
Date: Thu, 5 Apr 2018 16:19:25 -0700
Subject: ocfs2: fix some small problems

First, move setting fe_done = 1 inside the spin lock, to avoid any
potential race condition. Second, tune the mlog message level from ERROR
to NOTICE, since the message is not an error message. Third, change the
errno to -EAGAIN when the file check queue is full, which is more
appropriate in this case.
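The race fix reduces to publishing both flags under the same lock (this
snippet mirrors the hunk below):

	/* before: entry->fe_done = 1 was set outside fc_lock, so a
	 * reader could observe fc_done and fe_done out of sync */
	spin_lock(&ent->fs_fcheck->fc_lock);
	entry->fe_done = 1;
	ent->fs_fcheck->fc_done++;
	spin_unlock(&ent->fs_fcheck->fc_lock);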
Link: http://lkml.kernel.org/r/1495611866-27360-3-git-send-email-ghe@suse.com
Signed-off-by: Gang He
Cc: Mark Fasheh
Cc: Joel Becker
Cc: Junxiao Bi
Cc: Joseph Qi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/filecheck.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c
index 68f6c35dce36..6014ebda00da 100644
--- a/fs/ocfs2/filecheck.c
+++ b/fs/ocfs2/filecheck.c
@@ -283,7 +283,7 @@ ocfs2_filecheck_adjust_max(struct ocfs2_filecheck_sysfs_entry *ent,
 
 	spin_lock(&ent->fs_fcheck->fc_lock);
 	if (len < (ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done)) {
-		mlog(ML_ERROR,
+		mlog(ML_NOTICE,
 		     "Cannot set online file check maximum entry number "
 		     "to %u due to too many pending entries(%u)\n",
 		     len, ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done);
@@ -457,8 +457,8 @@ static void
 ocfs2_filecheck_done_entry(struct ocfs2_filecheck_sysfs_entry *ent,
 			   struct ocfs2_filecheck_entry *entry)
 {
-	entry->fe_done = 1;
 	spin_lock(&ent->fs_fcheck->fc_lock);
+	entry->fe_done = 1;
 	ent->fs_fcheck->fc_done++;
 	spin_unlock(&ent->fs_fcheck->fc_lock);
 }
@@ -540,11 +540,11 @@ static ssize_t ocfs2_filecheck_store(struct kobject *kobj,
 	spin_lock(&ent->fs_fcheck->fc_lock);
 	if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) &&
 	    (ent->fs_fcheck->fc_done == 0)) {
-		mlog(ML_ERROR,
+		mlog(ML_NOTICE,
 		     "Cannot do more file check "
 		     "since file check queue(%u) is full now\n",
 		     ent->fs_fcheck->fc_max);
-		ret = -EBUSY;
+		ret = -EAGAIN;
 		kfree(entry);
 	} else {
 		if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) &&
-- cgit v1.2.3

From 5f483c4abb507d068b326f24e4ea3481db9cda06 Mon Sep 17 00:00:00 2001
From: Gang He
Date: Thu, 5 Apr 2018 16:19:29 -0700
Subject: ocfs2: add kobject for online file check

Use the embedded kobject mechanism for the online file check feature;
this avoids using a global list to save/search per-device online file
check related data, and meanwhile reduces the line count and makes the
code logic clear. The changed code is based on Goldwyn Rodrigues's
patches and the ext4 fs code.
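A sketch of what the change buys (the osb_fc_ent field is the one this
patch adds to ocfs2_super):

	/* old: resolve the device to its entry via a global list */
	ent = ocfs2_filecheck_sysfs_get(sb->s_id);

	/* new: the entry is embedded in the per-mount ocfs2_super,
	 * so no lookup or reference-counted list is needed */
	struct ocfs2_filecheck_sysfs_entry *entry = &osb->osb_fc_ent;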
Link: http://lkml.kernel.org/r/1495611866-27360-4-git-send-email-ghe@suse.com Signed-off-by: Gang He Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/filecheck.c | 302 ++++++++++++++++++++------------------------------- fs/ocfs2/filecheck.h | 20 ++-- fs/ocfs2/ocfs2.h | 8 ++ fs/ocfs2/super.c | 27 ++++- 4 files changed, 155 insertions(+), 202 deletions(-) diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c index 6014ebda00da..a94c5310a59a 100644 --- a/fs/ocfs2/filecheck.c +++ b/fs/ocfs2/filecheck.c @@ -53,9 +53,6 @@ static const char * const ocfs2_filecheck_errs[] = { "UNSUPPORTED" }; -static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock); -static LIST_HEAD(ocfs2_filecheck_sysfs_list); - struct ocfs2_filecheck_entry { struct list_head fe_list; unsigned long fe_ino; @@ -83,35 +80,84 @@ ocfs2_filecheck_error(int errno) return ocfs2_filecheck_errs[errno - OCFS2_FILECHECK_ERR_START + 1]; } -static ssize_t ocfs2_filecheck_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf); -static ssize_t ocfs2_filecheck_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count); -static struct kobj_attribute ocfs2_attr_filecheck_chk = +static ssize_t ocfs2_filecheck_attr_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +static struct kobj_attribute ocfs2_filecheck_attr_chk = __ATTR(check, S_IRUSR | S_IWUSR, - ocfs2_filecheck_show, - ocfs2_filecheck_store); -static struct kobj_attribute ocfs2_attr_filecheck_fix = + ocfs2_filecheck_attr_show, + ocfs2_filecheck_attr_store); +static struct kobj_attribute ocfs2_filecheck_attr_fix = __ATTR(fix, S_IRUSR | S_IWUSR, - ocfs2_filecheck_show, - ocfs2_filecheck_store); -static struct kobj_attribute ocfs2_attr_filecheck_set = + ocfs2_filecheck_attr_show, + ocfs2_filecheck_attr_store); +static struct kobj_attribute ocfs2_filecheck_attr_set = __ATTR(set, S_IRUSR | S_IWUSR, - ocfs2_filecheck_show, - ocfs2_filecheck_store); + ocfs2_filecheck_attr_show, + ocfs2_filecheck_attr_store); +static struct attribute *ocfs2_filecheck_attrs[] = { + &ocfs2_filecheck_attr_chk.attr, + &ocfs2_filecheck_attr_fix.attr, + &ocfs2_filecheck_attr_set.attr, + NULL +}; + +static void ocfs2_filecheck_release(struct kobject *kobj) +{ + struct ocfs2_filecheck_sysfs_entry *entry = container_of(kobj, + struct ocfs2_filecheck_sysfs_entry, fs_kobj); + + complete(&entry->fs_kobj_unregister); +} + +static ssize_t +ocfs2_filecheck_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + ssize_t ret = -EIO; + struct kobj_attribute *kattr = container_of(attr, + struct kobj_attribute, attr); + + kobject_get(kobj); + if (kattr->show) + ret = kattr->show(kobj, kattr, buf); + kobject_put(kobj); + return ret; +} + +static ssize_t +ocfs2_filecheck_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + ssize_t ret = -EIO; + struct kobj_attribute *kattr = container_of(attr, + struct kobj_attribute, attr); + + kobject_get(kobj); + if (kattr->store) + ret = kattr->store(kobj, kattr, buf, count); + kobject_put(kobj); + return ret; +} + +static const struct sysfs_ops ocfs2_filecheck_ops = { + .show = ocfs2_filecheck_show, + .store = ocfs2_filecheck_store, +}; + +static struct kobj_type ocfs2_ktype_filecheck = { + .default_attrs = ocfs2_filecheck_attrs, + .sysfs_ops = &ocfs2_filecheck_ops, + 
.release = ocfs2_filecheck_release, +}; static void ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry) { struct ocfs2_filecheck_entry *p; - if (!atomic_dec_and_test(&entry->fs_count)) { - wait_var_event(&entry->fs_count, - !atomic_read(&entry->fs_count)); - } - spin_lock(&entry->fs_fcheck->fc_lock); while (!list_empty(&entry->fs_fcheck->fc_head)) { p = list_first_entry(&entry->fs_fcheck->fc_head, @@ -122,151 +168,48 @@ ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry) } spin_unlock(&entry->fs_fcheck->fc_lock); - kset_unregister(entry->fs_fcheckkset); - kset_unregister(entry->fs_devicekset); kfree(entry->fs_fcheck); - kfree(entry); -} - -static void -ocfs2_filecheck_sysfs_add(struct ocfs2_filecheck_sysfs_entry *entry) -{ - spin_lock(&ocfs2_filecheck_sysfs_lock); - list_add_tail(&entry->fs_list, &ocfs2_filecheck_sysfs_list); - spin_unlock(&ocfs2_filecheck_sysfs_lock); -} - -static int ocfs2_filecheck_sysfs_del(const char *devname) -{ - struct ocfs2_filecheck_sysfs_entry *p; - - spin_lock(&ocfs2_filecheck_sysfs_lock); - list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) { - if (!strcmp(p->fs_sb->s_id, devname)) { - list_del(&p->fs_list); - spin_unlock(&ocfs2_filecheck_sysfs_lock); - ocfs2_filecheck_sysfs_free(p); - return 0; - } - } - spin_unlock(&ocfs2_filecheck_sysfs_lock); - return 1; -} - -static void -ocfs2_filecheck_sysfs_put(struct ocfs2_filecheck_sysfs_entry *entry) -{ - if (atomic_dec_and_test(&entry->fs_count)) - wake_up_var(&entry->fs_count); -} - -static struct ocfs2_filecheck_sysfs_entry * -ocfs2_filecheck_sysfs_get(const char *devname) -{ - struct ocfs2_filecheck_sysfs_entry *p = NULL; - - spin_lock(&ocfs2_filecheck_sysfs_lock); - list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) { - if (!strcmp(p->fs_sb->s_id, devname)) { - atomic_inc(&p->fs_count); - spin_unlock(&ocfs2_filecheck_sysfs_lock); - return p; - } - } - spin_unlock(&ocfs2_filecheck_sysfs_lock); - return NULL; + entry->fs_fcheck = NULL; } -int ocfs2_filecheck_create_sysfs(struct super_block *sb) +int ocfs2_filecheck_create_sysfs(struct ocfs2_super *osb) { - int ret = 0; - struct kset *device_kset = NULL; - struct kset *fcheck_kset = NULL; - struct ocfs2_filecheck *fcheck = NULL; - struct ocfs2_filecheck_sysfs_entry *entry = NULL; - struct attribute **attrs = NULL; - struct attribute_group attrgp; - - if (!ocfs2_kset) - return -ENOMEM; - - attrs = kmalloc(sizeof(struct attribute *) * 4, GFP_NOFS); - if (!attrs) { - ret = -ENOMEM; - goto error; - } else { - attrs[0] = &ocfs2_attr_filecheck_chk.attr; - attrs[1] = &ocfs2_attr_filecheck_fix.attr; - attrs[2] = &ocfs2_attr_filecheck_set.attr; - attrs[3] = NULL; - memset(&attrgp, 0, sizeof(attrgp)); - attrgp.attrs = attrs; - } + int ret; + struct ocfs2_filecheck *fcheck; + struct ocfs2_filecheck_sysfs_entry *entry = &osb->osb_fc_ent; fcheck = kmalloc(sizeof(struct ocfs2_filecheck), GFP_NOFS); - if (!fcheck) { - ret = -ENOMEM; - goto error; - } else { - INIT_LIST_HEAD(&fcheck->fc_head); - spin_lock_init(&fcheck->fc_lock); - fcheck->fc_max = OCFS2_FILECHECK_MINSIZE; - fcheck->fc_size = 0; - fcheck->fc_done = 0; - } - - if (strlen(sb->s_id) <= 0) { - mlog(ML_ERROR, - "Cannot get device basename when create filecheck sysfs\n"); - ret = -ENODEV; - goto error; - } - - device_kset = kset_create_and_add(sb->s_id, NULL, &ocfs2_kset->kobj); - if (!device_kset) { - ret = -ENOMEM; - goto error; - } - - fcheck_kset = kset_create_and_add("filecheck", NULL, - &device_kset->kobj); - if (!fcheck_kset) { - ret = -ENOMEM; - goto error; 
- } - - ret = sysfs_create_group(&fcheck_kset->kobj, &attrgp); - if (ret) - goto error; + if (!fcheck) + return -ENOMEM; - entry = kmalloc(sizeof(struct ocfs2_filecheck_sysfs_entry), GFP_NOFS); - if (!entry) { - ret = -ENOMEM; - goto error; - } else { - atomic_set(&entry->fs_count, 1); - entry->fs_sb = sb; - entry->fs_devicekset = device_kset; - entry->fs_fcheckkset = fcheck_kset; - entry->fs_fcheck = fcheck; - ocfs2_filecheck_sysfs_add(entry); + INIT_LIST_HEAD(&fcheck->fc_head); + spin_lock_init(&fcheck->fc_lock); + fcheck->fc_max = OCFS2_FILECHECK_MINSIZE; + fcheck->fc_size = 0; + fcheck->fc_done = 0; + + entry->fs_kobj.kset = osb->osb_dev_kset; + init_completion(&entry->fs_kobj_unregister); + ret = kobject_init_and_add(&entry->fs_kobj, &ocfs2_ktype_filecheck, + NULL, "filecheck"); + if (ret) { + kfree(fcheck); + return ret; } - kfree(attrs); + entry->fs_fcheck = fcheck; return 0; - -error: - kfree(attrs); - kfree(entry); - kfree(fcheck); - kset_unregister(fcheck_kset); - kset_unregister(device_kset); - return ret; } -int ocfs2_filecheck_remove_sysfs(struct super_block *sb) +void ocfs2_filecheck_remove_sysfs(struct ocfs2_super *osb) { - return ocfs2_filecheck_sysfs_del(sb->s_id); + if (!osb->osb_fc_ent.fs_fcheck) + return; + + kobject_del(&osb->osb_fc_ent.fs_kobj); + kobject_put(&osb->osb_fc_ent.fs_kobj); + wait_for_completion(&osb->osb_fc_ent.fs_kobj_unregister); + ocfs2_filecheck_sysfs_free(&osb->osb_fc_ent); } static int @@ -360,7 +303,7 @@ ocfs2_filecheck_args_parse(const char *name, const char *buf, size_t count, return 0; } -static ssize_t ocfs2_filecheck_show(struct kobject *kobj, +static ssize_t ocfs2_filecheck_attr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -368,19 +311,12 @@ static ssize_t ocfs2_filecheck_show(struct kobject *kobj, ssize_t ret = 0, total = 0, remain = PAGE_SIZE; unsigned int type; struct ocfs2_filecheck_entry *p; - struct ocfs2_filecheck_sysfs_entry *ent; + struct ocfs2_filecheck_sysfs_entry *ent = container_of(kobj, + struct ocfs2_filecheck_sysfs_entry, fs_kobj); if (ocfs2_filecheck_type_parse(attr->attr.name, &type)) return -EINVAL; - ent = ocfs2_filecheck_sysfs_get(kobj->parent->name); - if (!ent) { - mlog(ML_ERROR, - "Cannot get the corresponding entry via device basename %s\n", - kobj->name); - return -ENODEV; - } - if (type == OCFS2_FILECHECK_TYPE_SET) { spin_lock(&ent->fs_fcheck->fc_lock); total = snprintf(buf, remain, "%u\n", ent->fs_fcheck->fc_max); @@ -414,11 +350,10 @@ static ssize_t ocfs2_filecheck_show(struct kobject *kobj, spin_unlock(&ent->fs_fcheck->fc_lock); exit: - ocfs2_filecheck_sysfs_put(ent); return total; } -static int +static inline int ocfs2_filecheck_erase_entry(struct ocfs2_filecheck_sysfs_entry *ent) { struct ocfs2_filecheck_entry *p; @@ -464,14 +399,14 @@ ocfs2_filecheck_done_entry(struct ocfs2_filecheck_sysfs_entry *ent, } static unsigned int -ocfs2_filecheck_handle(struct super_block *sb, +ocfs2_filecheck_handle(struct ocfs2_super *osb, unsigned long ino, unsigned int flags) { unsigned int ret = OCFS2_FILECHECK_ERR_SUCCESS; struct inode *inode = NULL; int rc; - inode = ocfs2_iget(OCFS2_SB(sb), ino, flags, 0); + inode = ocfs2_iget(osb, ino, flags, 0); if (IS_ERR(inode)) { rc = (int)(-(long)inode); if (rc >= OCFS2_FILECHECK_ERR_START && @@ -489,11 +424,14 @@ static void ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent, struct ocfs2_filecheck_entry *entry) { + struct ocfs2_super *osb = container_of(ent, struct ocfs2_super, + osb_fc_ent); + if (entry->fe_type == 
OCFS2_FILECHECK_TYPE_CHK) - entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb, + entry->fe_status = ocfs2_filecheck_handle(osb, entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_CHK); else if (entry->fe_type == OCFS2_FILECHECK_TYPE_FIX) - entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb, + entry->fe_status = ocfs2_filecheck_handle(osb, entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_FIX); else entry->fe_status = OCFS2_FILECHECK_ERR_UNSUPPORTED; @@ -501,30 +439,21 @@ ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent, ocfs2_filecheck_done_entry(ent, entry); } -static ssize_t ocfs2_filecheck_store(struct kobject *kobj, +static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { + ssize_t ret = 0; struct ocfs2_filecheck_args args; struct ocfs2_filecheck_entry *entry; - struct ocfs2_filecheck_sysfs_entry *ent; - ssize_t ret = 0; + struct ocfs2_filecheck_sysfs_entry *ent = container_of(kobj, + struct ocfs2_filecheck_sysfs_entry, fs_kobj); if (count == 0) return count; - if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args)) { - mlog(ML_ERROR, "Invalid arguments for online file check\n"); + if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args)) return -EINVAL; - } - - ent = ocfs2_filecheck_sysfs_get(kobj->parent->name); - if (!ent) { - mlog(ML_ERROR, - "Cannot get the corresponding entry via device basename %s\n", - kobj->parent->name); - return -ENODEV; - } if (args.fa_type == OCFS2_FILECHECK_TYPE_SET) { ret = ocfs2_filecheck_adjust_max(ent, args.fa_len); @@ -539,7 +468,7 @@ static ssize_t ocfs2_filecheck_store(struct kobject *kobj, spin_lock(&ent->fs_fcheck->fc_lock); if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && - (ent->fs_fcheck->fc_done == 0)) { + (ent->fs_fcheck->fc_done == 0)) { mlog(ML_NOTICE, "Cannot do more file check " "since file check queue(%u) is full now\n", @@ -569,6 +498,5 @@ static ssize_t ocfs2_filecheck_store(struct kobject *kobj, ocfs2_filecheck_handle_entry(ent, entry); exit: - ocfs2_filecheck_sysfs_put(ent); return (!ret ? 
count : ret); } diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h index af1678b620a4..6a22ee79e8d0 100644 --- a/fs/ocfs2/filecheck.h +++ b/fs/ocfs2/filecheck.h @@ -51,15 +51,6 @@ struct ocfs2_filecheck { unsigned int fc_done; /* Finished entry count in list */ }; -struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per mounting */ - struct list_head fs_list; - atomic_t fs_count; - struct super_block *fs_sb; - struct kset *fs_devicekset; - struct kset *fs_fcheckkset; - struct ocfs2_filecheck *fs_fcheck; -}; - #define OCFS2_FILECHECK_MAXSIZE 100 #define OCFS2_FILECHECK_MINSIZE 10 @@ -70,7 +61,14 @@ enum { OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */ }; -int ocfs2_filecheck_create_sysfs(struct super_block *sb); -int ocfs2_filecheck_remove_sysfs(struct super_block *sb); +struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per partition */ + struct kobject fs_kobj; + struct completion fs_kobj_unregister; + struct ocfs2_filecheck *fs_fcheck; +}; + + +int ocfs2_filecheck_create_sysfs(struct ocfs2_super *osb); +void ocfs2_filecheck_remove_sysfs(struct ocfs2_super *osb); #endif /* FILECHECK_H */ diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 6867eef2e06b..4f86ac0027b5 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -50,6 +50,8 @@ #include "reservations.h" +#include "filecheck.h" + /* Caching of metadata buffers */ /* Most user visible OCFS2 inodes will have very few pieces of @@ -472,6 +474,12 @@ struct ocfs2_super * workqueue and schedule on our own. */ struct workqueue_struct *ocfs2_wq; + + /* sysfs directory per partition */ + struct kset *osb_dev_kset; + + /* file check related stuff */ + struct ocfs2_filecheck_sysfs_entry osb_fc_ent; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index a49b7c2bde9e..3415e0b09398 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1161,6 +1161,23 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) ocfs2_complete_mount_recovery(osb); + osb->osb_dev_kset = kset_create_and_add(sb->s_id, NULL, + &ocfs2_kset->kobj); + if (!osb->osb_dev_kset) { + status = -ENOMEM; + mlog(ML_ERROR, "Unable to create device kset %s.\n", sb->s_id); + goto read_super_error; + } + + /* Create filecheck sysfs related directories/files at + * /sys/fs/ocfs2//filecheck */ + if (ocfs2_filecheck_create_sysfs(osb)) { + status = -ENOMEM; + mlog(ML_ERROR, "Unable to create filecheck sysfs directory at " + "/sys/fs/ocfs2/%s/filecheck.\n", sb->s_id); + goto read_super_error; + } + if (ocfs2_mount_local(osb)) snprintf(nodestr, sizeof(nodestr), "local"); else @@ -1199,9 +1216,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) /* Start this when the mount is almost sure of being successful */ ocfs2_orphan_scan_start(osb); - /* Create filecheck sysfile /sys/fs/ocfs2//filecheck */ - ocfs2_filecheck_create_sysfs(sb); - return status; read_super_error: @@ -1653,7 +1667,6 @@ static void ocfs2_put_super(struct super_block *sb) ocfs2_sync_blockdev(sb); ocfs2_dismount_volume(sb, 0); - ocfs2_filecheck_remove_sysfs(sb); } static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -1893,6 +1906,12 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) osb = OCFS2_SB(sb); BUG_ON(!osb); + /* Remove file check sysfs related directores/files, + * and wait for the pending file check operations */ + ocfs2_filecheck_remove_sysfs(osb); + + kset_unregister(osb->osb_dev_kset); + debugfs_remove(osb->osb_ctxt); /* Orphan scan 
should be stopped as early as possible */ -- cgit v1.2.3 From 39ec3774e3e8ebd5e45b0a5d071531d36f67c158 Mon Sep 17 00:00:00 2001 From: Gang He Date: Thu, 5 Apr 2018 16:19:32 -0700 Subject: ocfs2: add duplicated ino number check Add a duplicate ino number check to avoid adding a file to the file check list while that file is already being checked. Link: http://lkml.kernel.org/r/1495611866-27360-5-git-send-email-ghe@suse.com Signed-off-by: Gang He Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/filecheck.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c index a94c5310a59a..f65f2b2f594d 100644 --- a/fs/ocfs2/filecheck.c +++ b/fs/ocfs2/filecheck.c @@ -353,6 +353,22 @@ exit: return total; } +static inline int +ocfs2_filecheck_is_dup_entry(struct ocfs2_filecheck_sysfs_entry *ent, + unsigned long ino) +{ + struct ocfs2_filecheck_entry *p; + + list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) { + if (!p->fe_done) { + if (p->fe_ino == ino) + return 1; + } + } + + return 0; +} + static inline int ocfs2_filecheck_erase_entry(struct ocfs2_filecheck_sysfs_entry *ent) { @@ -467,7 +483,10 @@ static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj, } spin_lock(&ent->fs_fcheck->fc_lock); - if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && + if (ocfs2_filecheck_is_dup_entry(ent, args.fa_ino)) { + ret = -EEXIST; + kfree(entry); + } else if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && (ent->fs_fcheck->fc_done == 0)) { mlog(ML_NOTICE, "Cannot do more file check " -- cgit v1.2.3 From ba16ddfbeb9dde0df67cdb2006820e9cf4d99386 Mon Sep 17 00:00:00 2001 From: piaojun Date: Thu, 5 Apr 2018 16:19:36 -0700 Subject: ocfs2/o2hb: check len for bio_add_page() to avoid getting incorrect bio We need to check len for bio_add_page() to make sure the bio has been set up correctly, otherwise we may submit incorrect data to the device.
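The general rule being applied (a hedged sketch of the pattern, not the o2hb code itself): bio_add_page() returns the number of bytes actually attached, and anything short of the requested length means the bio must not be submitted as-is:

	len = bio_add_page(bio, page, vec_len, vec_start);
	if (len != vec_len) {
		/* page was not fully attached; submitting now would
		 * write less data than the caller intended */
		bio_put(bio);
		return ERR_PTR(-EIO);
	}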
Link: http://lkml.kernel.org/r/5ABC3EBE.5020807@huawei.com Signed-off-by: Jun Piao Reviewed-by: Yiwen Jiang Reviewed-by: Changwei Ge Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/heartbeat.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index ea8c551bcd7e..91a8889abf9b 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -570,7 +570,16 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, current_page, vec_len, vec_start); len = bio_add_page(bio, page, vec_len, vec_start); - if (len != vec_len) break; + if (len != vec_len) { + mlog(ML_ERROR, "Adding page[%d] to bio failed, " + "page %p, len %d, vec_len %u, vec_start %u, " + "bi_sector %llu\n", current_page, page, len, + vec_len, vec_start, + (unsigned long long)bio->bi_iter.bi_sector); + bio_put(bio); + bio = ERR_PTR(-EIO); + return bio; + } cs += vec_len / (PAGE_SIZE/spp); vec_start = 0; -- cgit v1.2.3 From de37428638ec7d22b46c4c20b8536f3681559e2c Mon Sep 17 00:00:00 2001 From: Changwei Ge Date: Thu, 5 Apr 2018 16:19:40 -0700 Subject: ocfs2/dlm: clean up unused variable in dlm_process_recovery_data Link: http://lkml.kernel.org/r/1522734135-7933-1-git-send-email-ge.changwei@h3c.com Signed-off-by: Changwei Ge Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmrecovery.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index b454eb371b77..802636d50365 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1823,7 +1823,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, int i, j, bad; struct dlm_lock *lock; u8 from = O2NM_MAX_NODES; - unsigned int added = 0; __be64 c; mlog(0, "running %d locks for this lockres\n", mres->num_locks); @@ -1839,7 +1838,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); dlm_lockres_set_refmap_bit(dlm, res, from); spin_unlock(&res->spinlock); - added++; break; } BUG_ON(ml->highest_blocked != LKM_IVMODE); @@ -1927,7 +1925,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, /* do not alter lock refcount. switching lists. */ list_move_tail(&lock->list, queue); spin_unlock(&res->spinlock); - added++; mlog(0, "just reordered a local lock!\n"); continue; @@ -2053,7 +2050,6 @@ skip_lvb: "setting refmap bit\n", dlm->name, res->lockname.len, res->lockname.name, ml->node); dlm_lockres_set_refmap_bit(dlm, res, ml->node); - added++; } spin_unlock(&res->spinlock); } -- cgit v1.2.3 From a85222435bd055b2d2cedc515d810a5ea6c05432 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Thu, 5 Apr 2018 16:19:44 -0700 Subject: net/9p: avoid -ERESTARTSYS leak to userspace If it was interrupted by a signal, the 9p client may need to send some more requests to the server for cleanup before returning to userspace. To avoid having such a last-minute request interrupted right away, the client records whether a signal is pending, clears TIF_SIGPENDING, handles the request and calls recalc_sigpending() before returning. Unfortunately, if the transmission of this cleanup request fails for any reason, the transport returns an error and the client propagates it right away, without calling recalc_sigpending().
This ends up with -ERESTARTSYS from the initially interrupted request crawling up to syscall exit, with TIF_SIGPENDING cleared by the cleanup request. The specific signal handling code, which is responsible for converting -ERESTARTSYS to -EINTR, is not called, and userspace receives the confusing errno value: open: Unknown error 512 (512) This is really hard to hit in real life. I discovered the issue while working on hot-unplug of a virtio-9p-pci device with an instrumented QEMU that allows controlling request completion. Both the p9_client_zc_rpc() and p9_client_rpc() functions actually have this buggy error path. Their code flow is a bit obscure and the best thing to do would probably be a full rewrite: to really ensure this situation of clearing TIF_SIGPENDING and returning -ERESTARTSYS can never happen. But given the general lack of interest for the 9p code, I won't risk breaking more things. So this patch simply fixes the buggy paths in both functions with a trivial label+goto. Thanks to Laurent Dufour for his help and suggestions on how to find the root cause and how to fix it. Link: http://lkml.kernel.org/r/152062809886.10599.7361006774123053312.stgit@bahia.lan Signed-off-by: Greg Kurz Reviewed-by: Andrew Morton Reviewed-by: Yiwen Jiang Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Latchesar Ionkov Cc: David Miller Cc: Laurent Dufour Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/9p/client.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/9p/client.c b/net/9p/client.c index b433aff5ff13..e6cae8332e2e 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -769,7 +769,7 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) if (err < 0) { if (err != -ERESTARTSYS && err != -EFAULT) c->status = Disconnected; - goto reterr; + goto recalc_sigpending; } again: /* Wait for the response */ @@ -804,6 +804,7 @@ again: if (req->status == REQ_STATUS_RCVD) err = 0; } +recalc_sigpending: if (sigpending) { spin_lock_irqsave(&current->sighand->siglock, flags); recalc_sigpending(); @@ -867,7 +868,7 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type, if (err == -EIO) c->status = Disconnected; if (err != -ERESTARTSYS) - goto reterr; + goto recalc_sigpending; } if (req->status == REQ_STATUS_ERROR) { p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err); @@ -885,6 +886,7 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type, if (req->status == REQ_STATUS_RCVD) err = 0; } +recalc_sigpending: if (sigpending) { spin_lock_irqsave(&current->sighand->siglock, flags); recalc_sigpending(); -- cgit v1.2.3 From ac89b2ef9b55924bcf922251f043ba73a32d05bb Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Thu, 5 Apr 2018 16:19:49 -0700 Subject: 9p: don't maintain dir i_nlink if the exported fs doesn't either If the exported filesystem dir on the 9p server doesn't maintain an accurate i_nlink count, e.g. always reports i_nlink as 1, then 9p should not maintain the nlink count either; otherwise drop_nlink() would report a warning once i_nlink reaches zero. For example: - overlayfs sets nlink to 1 for merged dir - ext4 (with dir_nlink feature enabled) sets nlink to 1 if a dir has more than EXT4_LINK_MAX (65000) links. In this case, every time stat(2) (a getattr) is called on such exported dirs on the 9p client side, i_nlink gets reset to 1; operations like rmdir(2), unlink(2) and rename(2) then cause the dir nlink to go to zero (and then negative), which results in warnings in drop_nlink() and/or inc_nlink() calls.
This can be reproduced easily with the following steps: - export a merged overlayfs dir via qemu virtfs to guest - mount the exported virtfs in guest - create two sub-directories in the root dir of the mounted 9pfs - stat the root dir of 9pfs, this resets nlink to 1 - remove all subdirs; the second unlink/rmdir triggers the warning ------------[ cut here ]------------ WARNING: CPU: 3 PID: 1284 at fs/inode.c:282 drop_nlink+0x3e/0x50 ... Call Trace: dump_stack+0x63/0x81 __warn+0xcb/0xf0 warn_slowpath_null+0x1d/0x20 drop_nlink+0x3e/0x50 v9fs_remove+0xaa/0x130 [9p] v9fs_vfs_rmdir+0x13/0x20 [9p] vfs_rmdir+0xb7/0x130 do_rmdir+0x1b8/0x230 SyS_unlinkat+0x22/0x30 do_syscall_64+0x67/0x180 ---[ end trace 43758d8ba91e603b ]--- Fix it by leaving i_nlink at 1 and not dropping nlink if a directory has nlink <= 2, which indicates that the underlying exported fs doesn't maintain the nlink count accurately. This follows what ext4 does in ext4_dec_count(). Link: http://lkml.kernel.org/r/20180312053829.4367-1-eguan@linux.alibaba.com Signed-off-by: Eryu Guan Reviewed-by: Yiwen Jiang Tested-by: Roman Kapl Cc: Caspar Zhang Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Latchesar Ionkov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_inode.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index bdabb2765d1b..9ee534159cc6 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -578,6 +578,24 @@ static int v9fs_at_to_dotl_flags(int flags) return rflags; } +/** + * v9fs_dec_count - helper functon to drop i_nlink. + * + * If a directory had nlink <= 2 (including . and ..), then we should not drop + * the link count, which indicates the underlying exported fs doesn't maintain + * nlink accurately. e.g. + * - overlayfs sets nlink to 1 for merged dir + * - ext4 (with dir_nlink feature enabled) sets nlink to 1 if a dir has more + * than EXT4_LINK_MAX (65000) links. + * + * @inode: inode whose nlink is being dropped + */ +static void v9fs_dec_count(struct inode *inode) +{ + if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) + drop_nlink(inode); +} + /** * v9fs_remove - helper function to remove files and directories * @dir: directory inode that is being deleted @@ -621,9 +639,9 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags) */ if (flags & AT_REMOVEDIR) { clear_nlink(inode); - drop_nlink(dir); + v9fs_dec_count(dir); } else - drop_nlink(inode); + v9fs_dec_count(inode); v9fs_invalidate_inode_attr(inode); v9fs_invalidate_inode_attr(dir); @@ -1024,12 +1042,12 @@ clunk_newdir: if (S_ISDIR(new_inode->i_mode)) clear_nlink(new_inode); else - drop_nlink(new_inode); + v9fs_dec_count(new_inode); } if (S_ISDIR(old_inode->i_mode)) { if (!new_inode) inc_nlink(new_dir); - drop_nlink(old_dir); + v9fs_dec_count(old_dir); } v9fs_invalidate_inode_attr(old_inode); v9fs_invalidate_inode_attr(old_dir); -- cgit v1.2.3 From a25c36577ca788f9ea4b229baef1b6d436393a4c Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Thu, 5 Apr 2018 16:19:53 -0700 Subject: 9p: check memory allocation result for cachetag Check the memory allocation result for cachetag in mount option parsing, and fix a potential memory leak in the error case.
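The pattern in condensed form (restating the fix below): match_strdup() allocates, so its NULL return must be handled, and the earlier allocation must be freed on the error path:

	kfree(v9ses->cachetag);		/* drop a prior value, if any */
	v9ses->cachetag = match_strdup(&args[0]);
	if (!v9ses->cachetag) {
		ret = -ENOMEM;
		goto free_and_return;
	}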
Link: http://lkml.kernel.org/r/1521614889-73446-1-git-send-email-cgxu519@gmx.com Signed-off-by: Chengguang Xu Reviewed-by: Andrew Morton Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Latchesar Ionkov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/v9fs.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 8fb89ddc6cc7..e622f0f10502 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -292,6 +292,10 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) #ifdef CONFIG_9P_FSCACHE kfree(v9ses->cachetag); v9ses->cachetag = match_strdup(&args[0]); + if (!v9ses->cachetag) { + ret = -ENOMEM; + goto free_and_return; + } #endif break; case Opt_cache: @@ -471,6 +475,9 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, return fid; err_clnt: +#ifdef CONFIG_9P_FSCACHE + kfree(v9ses->cachetag); +#endif p9_client_destroy(v9ses->clnt); err_names: kfree(v9ses->uname); -- cgit v1.2.3 From 7ff3c2046803ac99d95de6d63cda46c84f72293b Mon Sep 17 00:00:00 2001 From: Yiwen Jiang Date: Thu, 5 Apr 2018 16:19:57 -0700 Subject: fs/9p: don't set SB_NOATIME by default When the user uses some syscall, for example mmap (v9fs_file_mmap), atime will not be updated even if the user's mnt_flags were set without MNT_NOATIME, because v9fs defaults to setting SB_NOATIME in v9fs_set_super. To support access time updates when the user mounts with relatime, we should not set SB_NOATIME by default. Link: http://lkml.kernel.org/r/5AB9A377.6080906@huawei.com Signed-off-by: Yiwen Jiang Reviewed-by: Greg Kurz Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Latchesar Ionkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index af03c2a901eb..48ce50484e80 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -94,7 +94,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, if (v9ses->cache) sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE; - sb->s_flags |= SB_ACTIVE | SB_DIRSYNC | SB_NOATIME; + sb->s_flags |= SB_ACTIVE | SB_DIRSYNC; if (!v9ses->cache) sb->s_flags |= SB_SYNCHRONOUS; -- cgit v1.2.3 From 9421c3e64137ec69e5cf4ed024dc777a09b7779f Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Thu, 5 Apr 2018 16:20:01 -0700 Subject: net/9p/client.c: fix potential refcnt problem of trans module Specifying trans_mod multiple times in a mount causes an inaccurate refcount on the trans module. Also, in the error case of option parsing, we should put the trans module if we have already got it.
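The refcounting rule in condensed form (illustrative; it mirrors the fix below): each v9fs_get_trans_by_name() takes a module reference, so a repeated trans= option must drop the previous one first, and the error path must put whatever reference is currently held:

	v9fs_put_trans(clnt->trans_mod);	/* ref from an earlier trans= */
	clnt->trans_mod = v9fs_get_trans_by_name(s);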
Link: http://lkml.kernel.org/r/1522154942-57339-1-git-send-email-cgxu519@gmx.com Signed-off-by: Chengguang Xu Reviewed-by: Andrew Morton Cc: David Miller Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Latchesar Ionkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/9p/client.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/9p/client.c b/net/9p/client.c index e6cae8332e2e..21e6df1cc70f 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -190,7 +190,9 @@ static int parse_opts(char *opts, struct p9_client *clnt) p9_debug(P9_DEBUG_ERROR, "problem allocating copy of trans arg\n"); goto free_and_return; - } + } + + v9fs_put_trans(clnt->trans_mod); clnt->trans_mod = v9fs_get_trans_by_name(s); if (clnt->trans_mod == NULL) { pr_info("Could not find request transport: %s\n", @@ -226,6 +228,7 @@ static int parse_opts(char *opts, struct p9_client *clnt) } free_and_return: + v9fs_put_trans(clnt->trans_mod); kfree(tmp_options); return ret; } -- cgit v1.2.3 From 849cf55963dfd877ad92996195e0fe7218eaf0e9 Mon Sep 17 00:00:00 2001 From: shunki-fujita Date: Thu, 5 Apr 2018 16:20:07 -0700 Subject: fs: don't flush pagecache when expanding block device When changing the size of a block device, all of its caches are freed. This is necessary on shrinking, to prevent spurious I/O to the vanished region. On expanding, however, no such I/O happens. Similar considerations apply to btrfs filesystem resize and resize2fs, and they are designed not to drop caches when expanding. Therefore this patch removes the unnecessary cache drop. Link: http://lkml.kernel.org/r/1521457240-153390-1-git-send-email-shunki-fujita@cybozu.co.jp Signed-off-by: Shunki Fujita Cc: Al Viro Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/block_dev.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index fe09ef9c21f3..7a506c55a993 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1324,7 +1324,8 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty) * @bdev: struct bdev to adjust. * * This routine checks to see if the bdev size does not match the disk size - * and adjusts it if it differs. + * and adjusts it if it differs. When shrinking the bdev size, its all caches + * are freed. */ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) { @@ -1337,7 +1338,8 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) "%s: detected capacity change from %lld to %lld\n", disk->disk_name, bdev_size, disk_size); i_size_write(bdev->bd_inode, disk_size); - flush_disk(bdev, false); + if (bdev_size > disk_size) + flush_disk(bdev, false); } } EXPORT_SYMBOL(check_disk_size_change); -- cgit v1.2.3 From 1c99ba2918df372c7cb1fde81d8a6311fdd62aab Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 5 Apr 2018 16:20:11 -0700 Subject: mm/slab_common.c: mark kmalloc machinery as __ro_after_init kmalloc caches aren't relocated after being set up, and neither is the "size_index" array.
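For reference, __ro_after_init places data in a section that is written during boot and then remapped read-only, so later stray writes fault; a minimal hypothetical use:

	static unsigned long boot_lut[16] __ro_after_init;

	static int __init init_lut(void)
	{
		int i;

		for (i = 0; i < 16; i++)
			boot_lut[i] = 2 * i;	/* last legal writes */
		return 0;
	}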
Link: http://lkml.kernel.org/r/20180226203519.GA6886@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 10f127b2de7c..a1237d38a27e 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -954,11 +955,11 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, return s; } -struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; +struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init; EXPORT_SYMBOL(kmalloc_caches); #ifdef CONFIG_ZONE_DMA -struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; +struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init; EXPORT_SYMBOL(kmalloc_dma_caches); #endif @@ -968,7 +969,7 @@ EXPORT_SYMBOL(kmalloc_dma_caches); * of two cache sizes there. The size of larger slabs can be determined using * fls. */ -static s8 size_index[24] = { +static s8 size_index[24] __ro_after_init = { 3, /* 8 */ 4, /* 16 */ 5, /* 24 */ -- cgit v1.2.3 From 86609d3319ad57ff65c20890391a035acad47bb1 Mon Sep 17 00:00:00 2001 From: Chintan Pandya Date: Thu, 5 Apr 2018 16:20:15 -0700 Subject: mm/slub.c: use jitter-free reference while printing age When SLUB_DEBUG catches some issues, it prints all the required debug info. However, in a few cases where allocation and free of the object have happened in a very short time, 'age' might be misleading. See the example below: ============================================================================= BUG kmalloc-256 (Tainted: G W O ): Poison overwritten ----------------------------------------------------------------------------- ... INFO: Allocated in binder_transaction+0x4b0/0x2448 age=731 cpu=3 pid=5314 ... INFO: Freed in binder_free_transaction+0x2c/0x58 age=735 cpu=6 pid=2079 ... Object fffffff14956a870: 6b 6b 6b 6b 6b 6b 6b 6b 67 6b 6b 6b 6b 6b 6b a5 kkkkkkkkgkkkk In this case, the object was freed later, but 'age' shows otherwise. This could be because, while printing this info, we print the allocation trace first and the free trace thereafter. In between, if we get scheduled out or jiffies increments, (jiffies - t->when) becomes meaningless. Use a jitter-free reference to calculate age. The new output is exactly the same; 'age' is now derived from a single jiffies reference in both prints.
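The underlying idiom (a sketch with hypothetical variables): sample the time base once and derive both ages from that single snapshot, so the two printed values stay comparable:

	unsigned long now = jiffies;		/* one snapshot */

	pr_err("alloc age=%lu\n", now - alloc_when);
	pr_err("free  age=%lu\n", now - free_when);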
Change-Id: I0846565807a4229748649bbecb1ffb743d71fcd8 Link: http://lkml.kernel.org/r/1520492010-19389-1-git-send-email-cpandya@codeaurora.org Signed-off-by: Chintan Pandya Acked-by: Christoph Lameter Reviewed-by: Andrew Morton Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index e381728a3751..d92218ed7f1c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -598,13 +598,13 @@ static void init_tracking(struct kmem_cache *s, void *object) set_track(s, object, TRACK_ALLOC, 0UL); } -static void print_track(const char *s, struct track *t) +static void print_track(const char *s, struct track *t, unsigned long pr_time) { if (!t->addr) return; pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", - s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); + s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); #ifdef CONFIG_STACKTRACE { int i; @@ -619,11 +619,12 @@ static void print_track(const char *s, struct track *t) static void print_tracking(struct kmem_cache *s, void *object) { + unsigned long pr_time = jiffies; if (!(s->flags & SLAB_STORE_USER)) return; - print_track("Allocated", get_track(s, object, TRACK_ALLOC)); - print_track("Freed", get_track(s, object, TRACK_FREE)); + print_track("Allocated", get_track(s, object, TRACK_ALLOC), pr_time); + print_track("Freed", get_track(s, object, TRACK_FREE), pr_time); } static void print_page_info(struct page *page) -- cgit v1.2.3 From c86305743bdf3928ed7fbc685724c04bbd5331aa Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 5 Apr 2018 16:20:18 -0700 Subject: slab: fixup calculate_alignment() argument type Link: http://lkml.kernel.org/r/20180305200730.15812-1-adobriyan@gmail.com Signed-off-by: Alexey Dobriyan Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index a1237d38a27e..7626a64b8f14 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -280,7 +280,7 @@ static inline void memcg_unlink_cache(struct kmem_cache *s) * Figure out what the alignment of the objects will be given a set of * flags, a user specified alignment and the size of the objects. */ -static unsigned long calculate_alignment(unsigned long flags, +static unsigned long calculate_alignment(slab_flags_t flags, unsigned long align, unsigned long size) { /* -- cgit v1.2.3 From 36071a279b4100afe9fbee18727ad78daa307591 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 5 Apr 2018 16:20:22 -0700 Subject: slab: make kmalloc_index() return "unsigned int" kmalloc_index() returns an index into an array of kmalloc kmem caches, and therefore should be unsigned. Space savings with SLUB on a trimmed-down .config: add/remove: 0/1 grow/shrink: 6/56 up/down: 85/-557 (-472) Function old new delta calculate_sizes 924 983 +59 on_freelist 589 604 +15 init_cache_random_seq 122 127 +5 ext4_mb_init 1206 1210 +4 slab_pad_check.part 270 271 +1 cpu_partial_store 112 113 +1 usersize_show 28 27 -1 ... new_slab 1871 1837 -34 slab_order 204 - -204 This patch starts a series converting SLUB (mostly) to "unsigned int". 1) Most integers in the code are in fact unsigned entities: array indexes, lengths, buffer sizes, allocation orders.
It is therefore better to use unsigned variables 2) Some integers in the code are either "size_t" or "unsigned long" for no reason. size_t usually comes from people trying to maintain type correctness and figuring out that "sizeof" operator returns size_t or memset/memcpy takes size_t so should everything passed to it. However the number of 4GB+ objects in the kernel is very small. Most, if not all, dynamically allocated objects with kmalloc() or kmem_cache_create() aren't actually big. Maintaining wide types doesn't do anything. 64-bit ops are bigger than 32-bit on our beloved x86_64, so try to not use 64-bit where it isn't necessary (read: everywhere where integers are integers not pointers) 3) in case of SLAB allocators, there are additional limitations *) page->inuse, page->objects are only 16-/15-bit, *) cache size was always 32-bit *) slab orders are small, order 20 is needed to go 64-bit on x86_64 (PAGE_SIZE << order) Basically everything is 32-bit except kmalloc(1ULL<<32) which gets shortcut through page allocator. Christoph said: : : That changes with large base page size on power and ARM64 f.e. but then : we do not want to encourage larger allocations through slab anyways. Link: http://lkml.kernel.org/r/20180305200730.15812-2-adobriyan@gmail.com Signed-off-by: Alexey Dobriyan Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 231abc8976c5..296f33a512eb 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -308,7 +308,7 @@ extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; * 2 = 129 .. 192 bytes * n = 2^(n-1)+1 .. 2^n */ -static __always_inline int kmalloc_index(size_t size) +static __always_inline unsigned int kmalloc_index(size_t size) { if (!size) return 0; @@ -504,7 +504,7 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) return kmalloc_large(size, flags); #ifndef CONFIG_SLOB if (!(flags & GFP_DMA)) { - int index = kmalloc_index(size); + unsigned int index = kmalloc_index(size); if (!index) return ZERO_SIZE_PTR; @@ -542,7 +542,7 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) #ifndef CONFIG_SLOB if (__builtin_constant_p(size) && size <= KMALLOC_MAX_CACHE_SIZE && !(flags & GFP_DMA)) { - int i = kmalloc_index(size); + unsigned int i = kmalloc_index(size); if (!i) return ZERO_SIZE_PTR; -- cgit v1.2.3 From 0be70327ec8cf6dd6847cbd8b75ca51be864a6ea Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 5 Apr 2018 16:20:26 -0700 Subject: slab: make kmalloc_size() return "unsigned int" kmalloc_size() derives size of kmalloc cache from internal index, which can't be negative. Propagate unsignedness a bit. 
Link: http://lkml.kernel.org/r/20180305200730.15812-3-adobriyan@gmail.com Signed-off-by: Alexey Dobriyan Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 4 ++-- mm/slab_common.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 296f33a512eb..ad157fbf3886 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -522,11 +522,11 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) * return size or 0 if a kmalloc cache for that * size does not exist */ -static __always_inline int kmalloc_size(int n) +static __always_inline unsigned int kmalloc_size(unsigned int n) { #ifndef CONFIG_SLOB if (n > 2) - return 1 << n; + return 1U << n; if (n == 1 && KMALLOC_MIN_SIZE <= 32) return 96; diff --git a/mm/slab_common.c b/mm/slab_common.c index 7626a64b8f14..d3f4209c297d 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1138,9 +1138,9 @@ void __init create_kmalloc_caches(slab_flags_t flags) struct kmem_cache *s = kmalloc_caches[i]; if (s) { - int size = kmalloc_size(i); + unsigned int size = kmalloc_size(i); char *n = kasprintf(GFP_NOWAIT, - "dma-kmalloc-%d", size); + "dma-kmalloc-%u", size); BUG_ON(!n); kmalloc_dma_caches[i] = create_kmalloc_cache(n, -- cgit v1.2.3 From 55de8b9c60f2f6da9bf5c9144020882d07e62296 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 5 Apr 2018 16:20:29 -0700 Subject: slab: make create_kmalloc_cache() work with 32-bit sizes KMALLOC_MAX_CACHE_SIZE is 32-bit so is the largest kmalloc cache size. Christoph said: : : Ok SLABs maximum allocation size is limited to 32M (see : include/linux/slab.h: : : #define KMALLOC_SHIFT_HIGH ((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \ : (MAX_ORDER + PAGE_SHIFT - 1) : 25) : : And SLUB/SLOB pass all larger requests to the page allocator anyways. 
Link: http://lkml.kernel.org/r/20180305200730.15812-4-adobriyan@gmail.com Signed-off-by: Alexey Dobriyan Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.h | 8 ++++---- mm/slab_common.c | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 51813236e773..c8887965491b 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -77,7 +77,7 @@ extern struct kmem_cache *kmem_cache; /* A table of kmalloc cache names and sizes */ extern const struct kmalloc_info_struct { const char *name; - unsigned long size; + unsigned int size; } kmalloc_info[]; #ifndef CONFIG_SLOB @@ -93,9 +93,9 @@ struct kmem_cache *kmalloc_slab(size_t, gfp_t); /* Functions provided by the slab allocators */ int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags); -extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, - slab_flags_t flags, size_t useroffset, - size_t usersize); +struct kmem_cache *create_kmalloc_cache(const char *name, unsigned int size, + slab_flags_t flags, unsigned int useroffset, + unsigned int usersize); extern void create_boot_cache(struct kmem_cache *, const char *name, size_t size, slab_flags_t flags, size_t useroffset, size_t usersize); diff --git a/mm/slab_common.c b/mm/slab_common.c index d3f4209c297d..f9afca292858 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -939,9 +939,9 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz s->refcount = -1; /* Exempt from merging for now */ } -struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, - slab_flags_t flags, size_t useroffset, - size_t usersize) +struct kmem_cache *__init create_kmalloc_cache(const char *name, + unsigned int size, slab_flags_t flags, + unsigned int useroffset, unsigned int usersize) { struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); -- cgit v1.2.3 From 361d575e5c7a39c73a8a9bdd504c1d1274f280aa Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 5 Apr 2018 16:20:33 -0700 Subject: slab: make create_boot_cache() work with 32-bit sizes struct kmem_cache::size has always been "int", all those "size_t size" are fake. 
Link: http://lkml.kernel.org/r/20180305200730.15812-5-adobriyan@gmail.com Signed-off-by: Alexey Dobriyan Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.h | 4 ++-- mm/slab_common.c | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index c8887965491b..2a6d88044a56 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -97,8 +97,8 @@ struct kmem_cache *create_kmalloc_cache(const char *name, unsigned int size, slab_flags_t flags, unsigned int useroffset, unsigned int usersize); extern void create_boot_cache(struct kmem_cache *, const char *name, - size_t size, slab_flags_t flags, size_t useroffset, - size_t usersize); + unsigned int size, slab_flags_t flags, + unsigned int useroffset, unsigned int usersize); int slab_unmergeable(struct kmem_cache *s); struct kmem_cache *find_mergeable(size_t size, size_t align, diff --git a/mm/slab_common.c b/mm/slab_common.c index f9afca292858..2a7f09ce7c84 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -917,8 +917,9 @@ bool slab_is_available(void) #ifndef CONFIG_SLOB /* Create a cache during boot when no slab services are available yet */ -void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size, - slab_flags_t flags, size_t useroffset, size_t usersize) +void __init create_boot_cache(struct kmem_cache *s, const char *name, + unsigned int size, slab_flags_t flags, + unsigned int useroffset, unsigned int usersize) { int err; @@ -933,7 +934,7 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz err = __kmem_cache_create(s, flags); if (err) - panic("Creation of kmalloc slab %s size=%zu failed. Reason %d\n", + panic("Creation of kmalloc slab %s size=%u failed. Reason %d\n", name, size, err); s->refcount = -1; /* Exempt from merging for now */ -- cgit v1.2.3 From f4957d5bd09165b165df851fbf8c658f7fcd9922 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 5 Apr 2018 16:20:37 -0700 Subject: slab: make kmem_cache_create() work with 32-bit sizes struct kmem_cache::size and ::align were always 32-bit. Out of curiosity I created 4GB kmem_cache, it oopsed with division by 0. kmem_cache_create(1UL<<32+1) created 1-byte cache as expected. size_t doesn't work and never did. 
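The narrowing at work, as a standalone illustration (assuming a 64-bit unsigned long): converting a value just above 4GB to unsigned int keeps only the low 32 bits:

	unsigned long req = (1UL << 32) + 1;	/* 4 GiB + 1 */
	unsigned int sz = req;			/* implicit truncation */
	/* sz == 1, so a "4GB" cache silently becomes a 1-byte cache */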
Link: http://lkml.kernel.org/r/20180305200730.15812-6-adobriyan@gmail.com Signed-off-by: Alexey Dobriyan Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 7 ++++--- mm/slab.c | 2 +- mm/slab.h | 6 +++--- mm/slab_common.c | 19 ++++++++++--------- mm/slub.c | 2 +- 5 files changed, 19 insertions(+), 17 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index ad157fbf3886..d36e8f03730e 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -137,11 +137,12 @@ bool slab_is_available(void); extern bool usercopy_fallback; -struct kmem_cache *kmem_cache_create(const char *name, size_t size, - size_t align, slab_flags_t flags, +struct kmem_cache *kmem_cache_create(const char *name, unsigned int size, + unsigned int align, slab_flags_t flags, void (*ctor)(void *)); struct kmem_cache *kmem_cache_create_usercopy(const char *name, - size_t size, size_t align, slab_flags_t flags, + unsigned int size, unsigned int align, + slab_flags_t flags, size_t useroffset, size_t usersize, void (*ctor)(void *)); void kmem_cache_destroy(struct kmem_cache *); diff --git a/mm/slab.c b/mm/slab.c index 9095c3945425..ba25d8363eb2 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1877,7 +1877,7 @@ slab_flags_t kmem_cache_flags(unsigned long object_size, } struct kmem_cache * -__kmem_cache_alias(const char *name, size_t size, size_t align, +__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)) { struct kmem_cache *cachep; diff --git a/mm/slab.h b/mm/slab.h index 2a6d88044a56..0809580428fe 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -101,11 +101,11 @@ extern void create_boot_cache(struct kmem_cache *, const char *name, unsigned int useroffset, unsigned int usersize); int slab_unmergeable(struct kmem_cache *s); -struct kmem_cache *find_mergeable(size_t size, size_t align, +struct kmem_cache *find_mergeable(unsigned size, unsigned align, slab_flags_t flags, const char *name, void (*ctor)(void *)); #ifndef CONFIG_SLOB struct kmem_cache * -__kmem_cache_alias(const char *name, size_t size, size_t align, +__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)); slab_flags_t kmem_cache_flags(unsigned long object_size, @@ -113,7 +113,7 @@ slab_flags_t kmem_cache_flags(unsigned long object_size, void (*ctor)(void *)); #else static inline struct kmem_cache * -__kmem_cache_alias(const char *name, size_t size, size_t align, +__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)) { return NULL; } diff --git a/mm/slab_common.c b/mm/slab_common.c index 2a7f09ce7c84..a4545a61a7c8 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -82,7 +82,7 @@ unsigned int kmem_cache_size(struct kmem_cache *s) EXPORT_SYMBOL(kmem_cache_size); #ifdef CONFIG_DEBUG_VM -static int kmem_cache_sanity_check(const char *name, size_t size) +static int kmem_cache_sanity_check(const char *name, unsigned int size) { struct kmem_cache *s = NULL; @@ -113,7 +113,7 @@ static int kmem_cache_sanity_check(const char *name, size_t size) return 0; } #else -static inline int kmem_cache_sanity_check(const char *name, size_t size) +static inline int kmem_cache_sanity_check(const char *name, unsigned int size) { return 0; } @@ -280,8 +280,8 @@ static inline void memcg_unlink_cache(struct kmem_cache *s) * Figure out what the alignment of the objects will be given a 
set of * flags, a user specified alignment and the size of the objects. */ -static unsigned long calculate_alignment(slab_flags_t flags, - unsigned long align, unsigned long size) +static unsigned int calculate_alignment(slab_flags_t flags, + unsigned int align, unsigned int size) { /* * If the user wants hardware cache aligned objects then follow that @@ -291,7 +291,7 @@ static unsigned long calculate_alignment(slab_flags_t flags, * alignment though. If that is greater then use it. */ if (flags & SLAB_HWCACHE_ALIGN) { - unsigned long ralign; + unsigned int ralign; ralign = cache_line_size(); while (size <= ralign / 2) @@ -331,7 +331,7 @@ int slab_unmergeable(struct kmem_cache *s) return 0; } -struct kmem_cache *find_mergeable(size_t size, size_t align, +struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, slab_flags_t flags, const char *name, void (*ctor)(void *)) { struct kmem_cache *s; @@ -379,7 +379,7 @@ struct kmem_cache *find_mergeable(size_t size, size_t align, } static struct kmem_cache *create_cache(const char *name, - size_t object_size, size_t size, size_t align, + unsigned int object_size, unsigned int size, unsigned int align, slab_flags_t flags, size_t useroffset, size_t usersize, void (*ctor)(void *), struct mem_cgroup *memcg, struct kmem_cache *root_cache) @@ -452,7 +452,8 @@ out_free_cache: * as davem. */ struct kmem_cache * -kmem_cache_create_usercopy(const char *name, size_t size, size_t align, +kmem_cache_create_usercopy(const char *name, + unsigned int size, unsigned int align, slab_flags_t flags, size_t useroffset, size_t usersize, void (*ctor)(void *)) { @@ -532,7 +533,7 @@ out_unlock: EXPORT_SYMBOL(kmem_cache_create_usercopy); struct kmem_cache * -kmem_cache_create(const char *name, size_t size, size_t align, +kmem_cache_create(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)) { return kmem_cache_create_usercopy(name, size, align, flags, 0, 0, diff --git a/mm/slub.c b/mm/slub.c index d92218ed7f1c..495c785c8d7e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4242,7 +4242,7 @@ void __init kmem_cache_init_late(void) } struct kmem_cache * -__kmem_cache_alias(const char *name, size_t size, size_t align, +__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)) { struct kmem_cache *s, *c; -- cgit v1.2.3 From d5f866550df237861f9d59ca0206434b0dea9701 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 5 Apr 2018 16:20:40 -0700 Subject: slab: make size_index[] array u8 All those small numbers are reverse indexes into kmalloc caches array and can't be negative. On x86_64 "unsigned int = fls()" can drop CDQE instruction: add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-2 (-2) Function old new delta kmalloc_slab 101 99 -2 Link: http://lkml.kernel.org/r/20180305200730.15812-7-adobriyan@gmail.com Signed-off-by: Alexey Dobriyan Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index a4545a61a7c8..dda966e6bc58 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -971,7 +971,7 @@ EXPORT_SYMBOL(kmalloc_dma_caches); * of two cache sizes there. The size of larger slabs can be determined using * fls. 
*/ -static s8 size_index[24] __ro_after_init = { +static u8 size_index[24] __ro_after_init = { 3, /* 8 */ 4, /* 16 */ 5, /* 24 */ @@ -1009,7 +1009,7 @@ static inline int size_index_elem(size_t bytes) */ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) { - int index; + unsigned int index; if (unlikely(size > KMALLOC_MAX_SIZE)) { WARN_ON_ONCE(!(flags & __GFP_NOWARN)); -- cgit v1.2.3 From ac914d08bbb6afdc089ca6651af1988533c9786c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 5 Apr 2018 16:20:44 -0700 Subject: slab: make size_index_elem() unsigned int size_index_elem() always works with small sizes (kmalloc caches are 32-bit) and returns small indexes. Link: http://lkml.kernel.org/r/20180305200730.15812-8-adobriyan@gmail.com Signed-off-by: Alexey Dobriyan Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index dda966e6bc58..8abb2a46ae85 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -998,7 +998,7 @@ static u8 size_index[24] __ro_after_init = { 2 /* 192 */ }; -static inline int size_index_elem(size_t bytes) +static inline unsigned int size_index_elem(unsigned int bytes) { return (bytes - 1) / 8; } @@ -1067,13 +1067,13 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = { */ void __init setup_kmalloc_cache_index_table(void) { - int i; + unsigned int i; BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { - int elem = size_index_elem(i); + unsigned int elem = size_index_elem(i); if (elem >= ARRAY_SIZE(size_index)) break; -- cgit v1.2.3 From eb7235eb842043ca302e992286ca6af63a8127fe Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 5 Apr 2018 16:20:48 -0700 Subject: slub: make ->remote_node_defrag_ratio unsigned int ->remote_node_defrag_ratio is in range 0..1000. This also adds a check and modifies the behavior to return an error code. Before this patch invalid values were ignored. Link: http://lkml.kernel.org/r/20180305200730.15812-9-adobriyan@gmail.com Signed-off-by: Alexey Dobriyan Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 2 +- mm/slub.c | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 8ad99c47b19c..f6548083fe0f 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -124,7 +124,7 @@ struct kmem_cache { /* * Defragmentation by allocating from a remote node. 
*/ - int remote_node_defrag_ratio; + unsigned int remote_node_defrag_ratio; #endif #ifdef CONFIG_SLAB_FREELIST_RANDOM diff --git a/mm/slub.c b/mm/slub.c index 495c785c8d7e..97dd2db30acb 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5289,21 +5289,22 @@ SLAB_ATTR(shrink); #ifdef CONFIG_NUMA static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10); + return sprintf(buf, "%u\n", s->remote_node_defrag_ratio / 10); } static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, const char *buf, size_t length) { - unsigned long ratio; + unsigned int ratio; int err; - err = kstrtoul(buf, 10, &ratio); + err = kstrtouint(buf, 10, &ratio); if (err) return err; + if (ratio > 100) + return -ERANGE; - if (ratio <= 100) - s->remote_node_defrag_ratio = ratio * 10; + s->remote_node_defrag_ratio = ratio * 10; return length; } -- cgit v1.2.3 From 56d8ceebd39b4db3248291e6d1e3e696fc73b077 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 5 Apr 2018 16:20:51 -0700 Subject: slub: make ->max_attr_size unsigned int ->max_attr_size is maximum length of every SLAB memcg attribute ever written. VFS limits those to INT_MAX. Link: http://lkml.kernel.org/r/20180305200730.15812-10-adobriyan@gmail.com Signed-off-by: Alexey Dobriyan Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index f6548083fe0f..9bb761324a9c 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -110,7 +110,8 @@ struct kmem_cache { #endif #ifdef CONFIG_MEMCG struct memcg_cache_params memcg_params; - int max_attr_size; /* for propagation, maximum size of a stored attr */ + /* for propagation, maximum size of a stored attr */ + unsigned int max_attr_size; #ifdef CONFIG_SYSFS struct kset *memcg_kset; #endif -- cgit v1.2.3 From 2ca6d39b31022bb9e1dda77109e292517f701261 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 5 Apr 2018 16:20:55 -0700 Subject: slub: make ->red_left_pad unsigned int Padding length can't be negative. Link: http://lkml.kernel.org/r/20180305200730.15812-11-adobriyan@gmail.com Signed-off-by: Alexey Dobriyan Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 9bb761324a9c..9f59fc16444b 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -101,7 +101,7 @@ struct kmem_cache { int inuse; /* Offset to metadata */ int align; /* Alignment */ int reserved; /* Reserved bytes at the end of slabs */ - int red_left_pad; /* Left redzone padding size */ + unsigned int red_left_pad; /* Left redzone padding size */ const char *name; /* Name (only for display!) */ struct list_head list; /* List of slab caches */ #ifdef CONFIG_SYSFS -- cgit v1.2.3 From d66e52d1e82b1adfab541f1aad09526ebf67842d Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 5 Apr 2018 16:20:58 -0700 Subject: slub: make ->reserved unsigned int ->reserved is either 0 or sizeof(struct rcu_head), can't be negative. 
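A recurring companion to these signedness changes shows up in the sysfs show handlers: once a field becomes unsigned int, the format specifier has to move from "%d" to "%u" (see the reserved_show() hunk below). The reason is easy to demonstrate in userspace; on typical two's-complement targets a large unsigned value printed with "%d" comes out negative:

    #include <stdio.h>

    int main(void)
    {
        unsigned int v = 3000000000u;   /* > INT_MAX */

        printf("%u\n", v);  /* 3000000000 */
        printf("%d\n", v);  /* misprinted as a negative number */
        return 0;
    }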
Link: http://lkml.kernel.org/r/20180305200730.15812-12-adobriyan@gmail.com Signed-off-by: Alexey Dobriyan Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 2 +- mm/slub.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 9f59fc16444b..2b4417aa15d8 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -100,7 +100,7 @@ struct kmem_cache { void (*ctor)(void *); int inuse; /* Offset to metadata */ int align; /* Alignment */ - int reserved; /* Reserved bytes at the end of slabs */ + unsigned int reserved; /* Reserved bytes at the end of slabs */ unsigned int red_left_pad; /* Left redzone padding size */ const char *name; /* Name (only for display!) */ struct list_head list; /* List of slab caches */ diff --git a/mm/slub.c b/mm/slub.c index 97dd2db30acb..e38221c13b07 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5094,7 +5094,7 @@ SLAB_ATTR_RO(destroy_by_rcu); static ssize_t reserved_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->reserved); + return sprintf(buf, "%u\n", s->reserved); } SLAB_ATTR_RO(reserved); -- cgit v1.2.3 From 3a3791ec2ecd5db8d903b66faa340b0dfa72e64b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 5 Apr 2018 16:21:02 -0700 Subject: slub: make ->align unsigned int Kmem cache alignment can't be negative. Link: http://lkml.kernel.org/r/20180305200730.15812-13-adobriyan@gmail.com Signed-off-by: Alexey Dobriyan Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 2 +- mm/slub.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 2b4417aa15d8..2a0eabeff78f 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -99,7 +99,7 @@ struct kmem_cache { int refcount; /* Refcount for slab cache destroy */ void (*ctor)(void *); int inuse; /* Offset to metadata */ - int align; /* Alignment */ + unsigned int align; /* Alignment */ unsigned int reserved; /* Reserved bytes at the end of slabs */ unsigned int red_left_pad; /* Left redzone padding size */ const char *name; /* Name (only for display!) */ diff --git a/mm/slub.c b/mm/slub.c index e38221c13b07..7b081b4d2760 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4896,7 +4896,7 @@ SLAB_ATTR_RO(slab_size); static ssize_t align_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->align); + return sprintf(buf, "%u\n", s->align); } SLAB_ATTR_RO(align); -- cgit v1.2.3 From 52ee6d74aa23a3c5d4472edf167f2bb47776a733 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 5 Apr 2018 16:21:06 -0700 Subject: slub: make ->inuse unsigned int ->inuse is "the number of bytes in actual use by the object", can't be negative. 
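The hunk worth pausing on in the diff below is max_t(int, ...) becoming plain max(...): the kernel's max() macro type-checks its operands, and the explicit cast was only ever needed because ->inuse was signed while ALIGN(size, sizeof(void *)) yields an unsigned value. ALIGN itself is the standard round-up-to-a-power-of-two idiom; a local stand-in (not the kernel macro) behaves like this:

    #define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))

    /* Rounding a metadata offset up to a pointer boundary on a
     * 64-bit target: ALIGN_UP(37, 8) == 40, ALIGN_UP(40, 8) == 40. */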
Link: http://lkml.kernel.org/r/20180305200730.15812-14-adobriyan@gmail.com Signed-off-by: Alexey Dobriyan Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 2 +- mm/slub.c | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 2a0eabeff78f..2287b800474f 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -98,7 +98,7 @@ struct kmem_cache { gfp_t allocflags; /* gfp flags to use on each alloc */ int refcount; /* Refcount for slab cache destroy */ void (*ctor)(void *); - int inuse; /* Offset to metadata */ + unsigned int inuse; /* Offset to metadata */ unsigned int align; /* Alignment */ unsigned int reserved; /* Reserved bytes at the end of slabs */ unsigned int red_left_pad; /* Left redzone padding size */ diff --git a/mm/slub.c b/mm/slub.c index 7b081b4d2760..6d08bc66f0e7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4256,12 +4256,11 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, * the complete object on kzalloc. */ s->object_size = max(s->object_size, (int)size); - s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); + s->inuse = max(s->inuse, ALIGN(size, sizeof(void *))); for_each_memcg_cache(c, s) { c->object_size = s->object_size; - c->inuse = max_t(int, c->inuse, - ALIGN(size, sizeof(void *))); + c->inuse = max(c->inuse, ALIGN(size, sizeof(void *))); } if (sysfs_slab_alias(s, name)) { -- cgit v1.2.3 From e5d9998f3e09359b372a037a6ac55ba235d95d57 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 5 Apr 2018 16:21:10 -0700 Subject: slub: make ->cpu_partial unsigned int /* * cpu_partial determined the maximum number of objects * kept in the per cpu partial lists of a processor. */ Can't be negative. Link: http://lkml.kernel.org/r/20180305200730.15812-15-adobriyan@gmail.com Signed-off-by: Alexey Dobriyan Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 3 ++- mm/slub.c | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 2287b800474f..d2cc1391f17a 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -88,7 +88,8 @@ struct kmem_cache { int object_size; /* The size of an object without meta data */ int offset; /* Free pointer offset. 
*/ #ifdef CONFIG_SLUB_CPU_PARTIAL - int cpu_partial; /* Number of per cpu partial objects to keep around */ + /* Number of per cpu partial objects to keep around */ + unsigned int cpu_partial; #endif struct kmem_cache_order_objects oo; diff --git a/mm/slub.c b/mm/slub.c index 6d08bc66f0e7..2e72f15a03ea 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1812,7 +1812,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, { struct page *page, *page2; void *object = NULL; - int available = 0; + unsigned int available = 0; int objects; /* @@ -4962,10 +4962,10 @@ static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, size_t length) { - unsigned long objects; + unsigned int objects; int err; - err = kstrtoul(buf, 10, &objects); + err = kstrtouint(buf, 10, &objects); if (err) return err; if (objects && !kmem_cache_has_cpu_partial(s)) -- cgit v1.2.3 From a5035de2c4472d6c58c60a7f8eaad8ed0084b8b2 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 5 Apr 2018 16:21:13 -0700 Subject: slub: make ->offset unsigned int ->offset is free pointer offset from the start of the object, can't be negative. Link: http://lkml.kernel.org/r/20180305200730.15812-16-adobriyan@gmail.com Signed-off-by: Alexey Dobriyan Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index d2cc1391f17a..db00dbd7e89f 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -86,7 +86,7 @@ struct kmem_cache { unsigned long min_partial; int size; /* The size of an object including meta data */ int object_size; /* The size of an object without meta data */ - int offset; /* Free pointer offset. */ + unsigned int offset; /* Free pointer offset. */ #ifdef CONFIG_SLUB_CPU_PARTIAL /* Number of per cpu partial objects to keep around */ unsigned int cpu_partial; -- cgit v1.2.3 From 1b473f29d5dd766903ac2372ac04b07600f233d0 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 5 Apr 2018 16:21:17 -0700 Subject: slub: make ->object_size unsigned int Linux doesn't support negative length objects. Link: http://lkml.kernel.org/r/20180305200730.15812-17-adobriyan@gmail.com Signed-off-by: Alexey Dobriyan Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 2 +- mm/slab_common.c | 2 +- mm/slub.c | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index db00dbd7e89f..7d74f121ef4e 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -85,7 +85,7 @@ struct kmem_cache { slab_flags_t flags; unsigned long min_partial; int size; /* The size of an object including meta data */ - int object_size; /* The size of an object without meta data */ + unsigned int object_size;/* The size of an object without meta data */ unsigned int offset; /* Free pointer offset. 
*/ #ifdef CONFIG_SLUB_CPU_PARTIAL /* Number of per cpu partial objects to keep around */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 8abb2a46ae85..3e07b1fb22bd 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -103,7 +103,7 @@ static int kmem_cache_sanity_check(const char *name, unsigned int size) */ res = probe_kernel_address(s->name, tmp); if (res) { - pr_err("Slab cache with size %d has lost its name\n", + pr_err("Slab cache with size %u has lost its name\n", s->object_size); continue; } diff --git a/mm/slub.c b/mm/slub.c index 2e72f15a03ea..7431cd548776 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -681,7 +681,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) print_section(KERN_ERR, "Bytes b4 ", p - 16, 16); print_section(KERN_ERR, "Object ", p, - min_t(unsigned long, s->object_size, PAGE_SIZE)); + min_t(unsigned int, s->object_size, PAGE_SIZE)); if (s->flags & SLAB_RED_ZONE) print_section(KERN_ERR, "Redzone ", p + s->object_size, s->inuse - s->object_size); @@ -2399,7 +2399,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", nid, gfpflags, &gfpflags); - pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n", + pr_warn(" cache: %s, object size: %u, buffer size: %d, default order: %d, min order: %d\n", s->name, s->object_size, s->size, oo_order(s->oo), oo_order(s->min)); @@ -4255,7 +4255,7 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, * Adjust the object sizes so that we clear * the complete object on kzalloc. */ - s->object_size = max(s->object_size, (int)size); + s->object_size = max(s->object_size, size); s->inuse = max(s->inuse, ALIGN(size, sizeof(void *))); for_each_memcg_cache(c, s) { @@ -4901,7 +4901,7 @@ SLAB_ATTR_RO(align); static ssize_t object_size_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->object_size); + return sprintf(buf, "%u\n", s->object_size); } SLAB_ATTR_RO(object_size); -- cgit v1.2.3 From 44065b2e2975ff5987164b98d29cc78e207f9a5a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 5 Apr 2018 16:21:20 -0700 Subject: slub: make ->size unsigned int Linux doesn't support negative length objects (including meta data). Link: http://lkml.kernel.org/r/20180305200730.15812-18-adobriyan@gmail.com Signed-off-by: Alexey Dobriyan Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 2 +- mm/slub.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 7d74f121ef4e..bc02fd3a8ccf 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -84,7 +84,7 @@ struct kmem_cache { /* Used for retriving partial slabs etc */ slab_flags_t flags; unsigned long min_partial; - int size; /* The size of an object including meta data */ + unsigned int size; /* The size of an object including meta data */ unsigned int object_size;/* The size of an object without meta data */ unsigned int offset; /* Free pointer offset. 
*/ #ifdef CONFIG_SLUB_CPU_PARTIAL diff --git a/mm/slub.c b/mm/slub.c index 7431cd548776..ee11896c2a13 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2399,7 +2399,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", nid, gfpflags, &gfpflags); - pr_warn(" cache: %s, object size: %u, buffer size: %d, default order: %d, min order: %d\n", + pr_warn(" cache: %s, object size: %u, buffer size: %u, default order: %d, min order: %d\n", s->name, s->object_size, s->size, oo_order(s->oo), oo_order(s->min)); @@ -3633,8 +3633,8 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) free_kmem_cache_nodes(s); error: if (flags & SLAB_PANIC) - panic("Cannot create slab %s size=%lu realsize=%u order=%u offset=%u flags=%lx\n", - s->name, (unsigned long)s->size, s->size, + panic("Cannot create slab %s size=%u realsize=%u order=%u offset=%u flags=%lx\n", + s->name, s->size, s->size, oo_order(s->oo), s->offset, (unsigned long)flags); return -EINVAL; } @@ -3825,7 +3825,7 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, bool to_user) { struct kmem_cache *s; - unsigned long offset; + unsigned int offset; size_t object_size; /* Find object and usable object size. */ @@ -4889,7 +4889,7 @@ struct slab_attribute { static ssize_t slab_size_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->size); + return sprintf(buf, "%u\n", s->size); } SLAB_ATTR_RO(slab_size); @@ -5664,7 +5664,7 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = 'A'; if (p != name + 1) *p++ = '-'; - p += sprintf(p, "%07d", s->size); + p += sprintf(p, "%07u", s->size); BUG_ON(p > name + ID_STR_LENGTH - 1); return name; -- cgit v1.2.3 From 0293d1fdd677a09b816df0c7bfe8f60d1b9b956f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 5 Apr 2018 16:21:24 -0700 Subject: slab: make kmem_cache_flags accept 32-bit object size Now that all sizes are properly typed, propagate "unsigned int" down the callgraph. 
Link: http://lkml.kernel.org/r/20180305200730.15812-19-adobriyan@gmail.com Signed-off-by: Alexey Dobriyan Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 +- mm/slab.h | 4 ++-- mm/slub.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index ba25d8363eb2..063a02d79c8e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1869,7 +1869,7 @@ static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) return 0; } -slab_flags_t kmem_cache_flags(unsigned long object_size, +slab_flags_t kmem_cache_flags(unsigned int object_size, slab_flags_t flags, const char *name, void (*ctor)(void *)) { diff --git a/mm/slab.h b/mm/slab.h index 0809580428fe..8f1072f49285 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -108,7 +108,7 @@ struct kmem_cache * __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)); -slab_flags_t kmem_cache_flags(unsigned long object_size, +slab_flags_t kmem_cache_flags(unsigned int object_size, slab_flags_t flags, const char *name, void (*ctor)(void *)); #else @@ -117,7 +117,7 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)) { return NULL; } -static inline slab_flags_t kmem_cache_flags(unsigned long object_size, +static inline slab_flags_t kmem_cache_flags(unsigned int object_size, slab_flags_t flags, const char *name, void (*ctor)(void *)) { diff --git a/mm/slub.c b/mm/slub.c index ee11896c2a13..b4a739f8f84d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1293,7 +1293,7 @@ out: __setup("slub_debug", setup_slub_debug); -slab_flags_t kmem_cache_flags(unsigned long object_size, +slab_flags_t kmem_cache_flags(unsigned int object_size, slab_flags_t flags, const char *name, void (*ctor)(void *)) { @@ -1326,7 +1326,7 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) {} static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) {} -slab_flags_t kmem_cache_flags(unsigned long object_size, +slab_flags_t kmem_cache_flags(unsigned int object_size, slab_flags_t flags, const char *name, void (*ctor)(void *)) { -- cgit v1.2.3 From be4a7988b35db9e6f95dca818d5e94785840fb58 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 5 Apr 2018 16:21:28 -0700 Subject: kasan: make kasan_cache_create() work with 32-bit slab cache sizes If SLAB doesn't support 4GB+ kmem caches (it never did), KASAN should not do it as well. 
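The adaptive redzone ladder in the diff below derives the redzone from the object size. Tracing a few values by hand against the thresholds in the patch:

    optimal_redzone(32)   == 16;   /* 32 <= 64 - 16, first bucket */
    optimal_redzone(100)  == 64;   /* falls through to 100 <= 512 - 64 */
    optimal_redzone(8192) == 256;  /* 8192 <= (1 << 14) - 256 */

Every rung of the ladder fits comfortably in unsigned int, which is what makes the narrowing of optimal_redzone() and kasan_cache_create() safe.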
Link: http://lkml.kernel.org/r/20180305200730.15812-20-adobriyan@gmail.com Signed-off-by: Alexey Dobriyan Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 4 ++-- mm/kasan/kasan.c | 12 ++++++------ mm/slab.c | 2 +- mm/slub.c | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d6459bd1376d..de784fd11d12 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -43,7 +43,7 @@ void kasan_unpoison_stack_above_sp_to(const void *watermark); void kasan_alloc_pages(struct page *page, unsigned int order); void kasan_free_pages(struct page *page, unsigned int order); -void kasan_cache_create(struct kmem_cache *cache, size_t *size, +void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags); void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); @@ -92,7 +92,7 @@ static inline void kasan_alloc_pages(struct page *page, unsigned int order) {} static inline void kasan_free_pages(struct page *page, unsigned int order) {} static inline void kasan_cache_create(struct kmem_cache *cache, - size_t *size, + unsigned int *size, slab_flags_t *flags) {} static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index e13d911251e7..f7a5e1d1ba87 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -323,9 +323,9 @@ void kasan_free_pages(struct page *page, unsigned int order) * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. * For larger allocations larger redzones are used. */ -static size_t optimal_redzone(size_t object_size) +static unsigned int optimal_redzone(unsigned int object_size) { - int rz = + return object_size <= 64 - 16 ? 16 : object_size <= 128 - 32 ? 32 : object_size <= 512 - 64 ? 64 : @@ -333,14 +333,13 @@ static size_t optimal_redzone(size_t object_size) object_size <= (1 << 14) - 256 ? 256 : object_size <= (1 << 15) - 512 ? 512 : object_size <= (1 << 16) - 1024 ? 1024 : 2048; - return rz; } -void kasan_cache_create(struct kmem_cache *cache, size_t *size, +void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags) { + unsigned int orig_size = *size; int redzone_adjust; - int orig_size = *size; /* Add alloc meta. 
*/ cache->kasan_info.alloc_meta_offset = *size; @@ -358,7 +357,8 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size, if (redzone_adjust > 0) *size += redzone_adjust; - *size = min(KMALLOC_MAX_SIZE, max(*size, cache->object_size + + *size = min_t(unsigned int, KMALLOC_MAX_SIZE, + max(*size, cache->object_size + optimal_redzone(cache->object_size))); /* diff --git a/mm/slab.c b/mm/slab.c index 063a02d79c8e..fb106e8277b7 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1994,7 +1994,7 @@ int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags) size_t ralign = BYTES_PER_WORD; gfp_t gfp; int err; - size_t size = cachep->size; + unsigned int size = cachep->size; #if DEBUG #if FORCED_DEBUG diff --git a/mm/slub.c b/mm/slub.c index b4a739f8f84d..dfead847961c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3458,7 +3458,7 @@ static void set_cpu_partial(struct kmem_cache *s) static int calculate_sizes(struct kmem_cache *s, int forced_order) { slab_flags_t flags = s->flags; - size_t size = s->object_size; + unsigned int size = s->object_size; int order; /* -- cgit v1.2.3 From 7bbdb81ee3de73f2381ceec1bbee831f4c913b5c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 5 Apr 2018 16:21:31 -0700 Subject: slab: make usercopy region 32-bit If kmem case sizes are 32-bit, then usecopy region should be too. Link: http://lkml.kernel.org/r/20180305200730.15812-21-adobriyan@gmail.com Signed-off-by: Alexey Dobriyan Cc: David Miller Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 2 +- include/linux/slab_def.h | 4 ++-- include/linux/slub_def.h | 4 ++-- include/net/sock.h | 4 ++-- mm/slab.h | 4 ++-- mm/slab_common.c | 7 ++++--- mm/slub.c | 2 +- 7 files changed, 14 insertions(+), 13 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index d36e8f03730e..04402c637171 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -143,7 +143,7 @@ struct kmem_cache *kmem_cache_create(const char *name, unsigned int size, struct kmem_cache *kmem_cache_create_usercopy(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, - size_t useroffset, size_t usersize, + unsigned int useroffset, unsigned int usersize, void (*ctor)(void *)); void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 7385547c04b1..d9228e4d0320 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -85,8 +85,8 @@ struct kmem_cache { unsigned int *random_seq; #endif - size_t useroffset; /* Usercopy region offset */ - size_t usersize; /* Usercopy region size */ + unsigned int useroffset; /* Usercopy region offset */ + unsigned int usersize; /* Usercopy region size */ struct kmem_cache_node *node[MAX_NUMNODES]; }; diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index bc02fd3a8ccf..623d6ba92036 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -137,8 +137,8 @@ struct kmem_cache { struct kasan_cache kasan_info; #endif - size_t useroffset; /* Usercopy region offset */ - size_t usersize; /* Usercopy region size */ + unsigned int useroffset; /* Usercopy region offset */ + unsigned int usersize; /* Usercopy region size */ struct kmem_cache_node *node[MAX_NUMNODES]; }; diff --git a/include/net/sock.h b/include/net/sock.h index 49bd2c1796b0..74d725fdbe0f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1114,8 +1114,8 @@ 
struct proto { struct kmem_cache *slab; unsigned int obj_size; slab_flags_t slab_flags; - size_t useroffset; /* Usercopy region offset */ - size_t usersize; /* Usercopy region size */ + unsigned int useroffset; /* Usercopy region offset */ + unsigned int usersize; /* Usercopy region size */ struct percpu_counter *orphan_count; diff --git a/mm/slab.h b/mm/slab.h index 8f1072f49285..e8981e811c45 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -22,8 +22,8 @@ struct kmem_cache { unsigned int size; /* The aligned/padded/added on size */ unsigned int align; /* Alignment as calculated */ slab_flags_t flags; /* Active flags on the slab */ - size_t useroffset; /* Usercopy region offset */ - size_t usersize; /* Usercopy region size */ + unsigned int useroffset;/* Usercopy region offset */ + unsigned int usersize; /* Usercopy region size */ const char *name; /* Slab name for sysfs */ int refcount; /* Use counter */ void (*ctor)(void *); /* Called on object slot creation */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 3e07b1fb22bd..01224cb90080 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -380,8 +380,8 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, static struct kmem_cache *create_cache(const char *name, unsigned int object_size, unsigned int size, unsigned int align, - slab_flags_t flags, size_t useroffset, - size_t usersize, void (*ctor)(void *), + slab_flags_t flags, unsigned int useroffset, + unsigned int usersize, void (*ctor)(void *), struct mem_cgroup *memcg, struct kmem_cache *root_cache) { struct kmem_cache *s; @@ -454,7 +454,8 @@ out_free_cache: struct kmem_cache * kmem_cache_create_usercopy(const char *name, unsigned int size, unsigned int align, - slab_flags_t flags, size_t useroffset, size_t usersize, + slab_flags_t flags, + unsigned int useroffset, unsigned int usersize, void (*ctor)(void *)) { struct kmem_cache *s = NULL; diff --git a/mm/slub.c b/mm/slub.c index dfead847961c..a10cf661cdae 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5081,7 +5081,7 @@ SLAB_ATTR_RO(cache_dma); static ssize_t usersize_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%zu\n", s->usersize); + return sprintf(buf, "%u\n", s->usersize); } SLAB_ATTR_RO(usersize); -- cgit v1.2.3 From 284b50ddcf32e3fe1cd39bfc9ad803704e6ca338 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 5 Apr 2018 16:21:35 -0700 Subject: slub: make slab_index() return unsigned int slab_index() returns index of an object within a slab which is at most u15 (or u16?). Iterators additionally guarantee that "p >= addr". 
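slab_index() in the diff below is plain stride arithmetic: with addr the base address of the slab and s->size the per-object stride, the index is (p - addr) / s->size. A worked instance with made-up numbers:

    char buf[4096];                          /* stand-in slab memory */
    char *addr = buf;                        /* slab base */
    char *p = addr + 2560;                   /* start of the 11th object,
                                                stride 256 */
    unsigned int idx = (p - addr) / 256;     /* == 10 */

Because the iterators guarantee p >= addr, the pointer difference is never negative and the unsigned return type is safe.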
Link: http://lkml.kernel.org/r/20180305200730.15812-22-adobriyan@gmail.com Signed-off-by: Alexey Dobriyan Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index a10cf661cdae..5c5b9aed10a1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -311,7 +311,7 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) __p += (__s)->size, __idx++) /* Determine object index from a given position */ -static inline int slab_index(void *p, struct kmem_cache *s, void *addr) +static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr) { return (p - addr) / s->size; } -- cgit v1.2.3 From 19af27aff901e401a5b79e5c974e881e4701162c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 5 Apr 2018 16:21:39 -0700 Subject: slub: make struct kmem_cache_order_objects::x unsigned int struct kmem_cache_order_objects is for mixing order and number of objects, and orders aren't big enough to warrant 64-bit width. Propagate unsignedness down so that everything fits. !!! Patch assumes that "PAGE_SIZE << order" doesn't overflow. !!! Link: http://lkml.kernel.org/r/20180305200730.15812-23-adobriyan@gmail.com Signed-off-by: Alexey Dobriyan Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 2 +- mm/slub.c | 74 +++++++++++++++++++++++++----------------------- 2 files changed, 40 insertions(+), 36 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 623d6ba92036..3773e26c08c1 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -73,7 +73,7 @@ struct kmem_cache_cpu { * given order would contain. 
*/ struct kmem_cache_order_objects { - unsigned long x; + unsigned int x; }; /* diff --git a/mm/slub.c b/mm/slub.c index 5c5b9aed10a1..21e9de8f8d3a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -316,13 +316,13 @@ static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr) return (p - addr) / s->size; } -static inline int order_objects(int order, unsigned long size, int reserved) +static inline unsigned int order_objects(unsigned int order, unsigned int size, unsigned int reserved) { - return ((PAGE_SIZE << order) - reserved) / size; + return (((unsigned int)PAGE_SIZE << order) - reserved) / size; } -static inline struct kmem_cache_order_objects oo_make(int order, - unsigned long size, int reserved) +static inline struct kmem_cache_order_objects oo_make(unsigned int order, + unsigned int size, unsigned int reserved) { struct kmem_cache_order_objects x = { (order << OO_SHIFT) + order_objects(order, size, reserved) @@ -331,12 +331,12 @@ static inline struct kmem_cache_order_objects oo_make(int order, return x; } -static inline int oo_order(struct kmem_cache_order_objects x) +static inline unsigned int oo_order(struct kmem_cache_order_objects x) { return x.x >> OO_SHIFT; } -static inline int oo_objects(struct kmem_cache_order_objects x) +static inline unsigned int oo_objects(struct kmem_cache_order_objects x) { return x.x & OO_MASK; } @@ -1436,7 +1436,7 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s, gfp_t flags, int node, struct kmem_cache_order_objects oo) { struct page *page; - int order = oo_order(oo); + unsigned int order = oo_order(oo); if (node == NUMA_NO_NODE) page = alloc_pages(flags, order); @@ -1455,8 +1455,8 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s, /* Pre-initialize the random sequence cache */ static int init_cache_random_seq(struct kmem_cache *s) { + unsigned int count = oo_objects(s->oo); int err; - unsigned long i, count = oo_objects(s->oo); /* Bailout if already initialised */ if (s->random_seq) @@ -1471,6 +1471,8 @@ static int init_cache_random_seq(struct kmem_cache *s) /* Transform to an offset on the set of pages */ if (s->random_seq) { + unsigned int i; + for (i = 0; i < count; i++) s->random_seq[i] *= s->size; } @@ -2399,7 +2401,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", nid, gfpflags, &gfpflags); - pr_warn(" cache: %s, object size: %u, buffer size: %u, default order: %d, min order: %d\n", + pr_warn(" cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n", s->name, s->object_size, s->size, oo_order(s->oo), oo_order(s->min)); @@ -3182,9 +3184,9 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk); * and increases the number of allocations possible without having to * take the list_lock. */ -static int slub_min_order; -static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; -static int slub_min_objects; +static unsigned int slub_min_order; +static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; +static unsigned int slub_min_objects; /* * Calculate the order of allocation given an slab object size. @@ -3211,20 +3213,21 @@ static int slub_min_objects; * requested a higher mininum order then we start with that one instead of * the smallest order which will fit the object. 
*/ -static inline int slab_order(int size, int min_objects, - int max_order, int fract_leftover, int reserved) +static inline unsigned int slab_order(unsigned int size, + unsigned int min_objects, unsigned int max_order, + unsigned int fract_leftover, unsigned int reserved) { - int order; - int rem; - int min_order = slub_min_order; + unsigned int min_order = slub_min_order; + unsigned int order; if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) return get_order(size * MAX_OBJS_PER_PAGE) - 1; - for (order = max(min_order, get_order(min_objects * size + reserved)); + for (order = max(min_order, (unsigned int)get_order(min_objects * size + reserved)); order <= max_order; order++) { - unsigned long slab_size = PAGE_SIZE << order; + unsigned int slab_size = (unsigned int)PAGE_SIZE << order; + unsigned int rem; rem = (slab_size - reserved) % size; @@ -3235,12 +3238,11 @@ static inline int slab_order(int size, int min_objects, return order; } -static inline int calculate_order(int size, int reserved) +static inline int calculate_order(unsigned int size, unsigned int reserved) { - int order; - int min_objects; - int fraction; - int max_objects; + unsigned int order; + unsigned int min_objects; + unsigned int max_objects; /* * Attempt to find best configuration for a slab. This @@ -3257,6 +3259,8 @@ static inline int calculate_order(int size, int reserved) min_objects = min(min_objects, max_objects); while (min_objects > 1) { + unsigned int fraction; + fraction = 16; while (fraction >= 4) { order = slab_order(size, min_objects, @@ -3459,7 +3463,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) { slab_flags_t flags = s->flags; unsigned int size = s->object_size; - int order; + unsigned int order; /* * Round up object size to the next word boundary. 
We can only @@ -3549,7 +3553,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) else order = calculate_order(size, s->reserved); - if (order < 0) + if ((int)order < 0) return 0; s->allocflags = 0; @@ -3717,7 +3721,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s) static int __init setup_slub_min_order(char *str) { - get_option(&str, &slub_min_order); + get_option(&str, (int *)&slub_min_order); return 1; } @@ -3726,8 +3730,8 @@ __setup("slub_min_order=", setup_slub_min_order); static int __init setup_slub_max_order(char *str) { - get_option(&str, &slub_max_order); - slub_max_order = min(slub_max_order, MAX_ORDER - 1); + get_option(&str, (int *)&slub_max_order); + slub_max_order = min(slub_max_order, (unsigned int)MAX_ORDER - 1); return 1; } @@ -3736,7 +3740,7 @@ __setup("slub_max_order=", setup_slub_max_order); static int __init setup_slub_min_objects(char *str) { - get_option(&str, &slub_min_objects); + get_option(&str, (int *)&slub_min_objects); return 1; } @@ -4231,7 +4235,7 @@ void __init kmem_cache_init(void) cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL, slub_cpu_dead); - pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%u, Nodes=%d\n", + pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%d\n", cache_line_size(), slub_min_order, slub_max_order, slub_min_objects, nr_cpu_ids, nr_node_ids); @@ -4907,17 +4911,17 @@ SLAB_ATTR_RO(object_size); static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", oo_objects(s->oo)); + return sprintf(buf, "%u\n", oo_objects(s->oo)); } SLAB_ATTR_RO(objs_per_slab); static ssize_t order_store(struct kmem_cache *s, const char *buf, size_t length) { - unsigned long order; + unsigned int order; int err; - err = kstrtoul(buf, 10, &order); + err = kstrtouint(buf, 10, &order); if (err) return err; @@ -4930,7 +4934,7 @@ static ssize_t order_store(struct kmem_cache *s, static ssize_t order_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", oo_order(s->oo)); + return sprintf(buf, "%u\n", oo_order(s->oo)); } SLAB_ATTR(order); -- cgit v1.2.3 From 870b1fbb036c0e115aeb8a48478643c16b73d204 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 5 Apr 2018 16:21:43 -0700 Subject: slub: make size_from_object() return unsigned int Function returns size of the object without red zone which can't be negative. Link: http://lkml.kernel.org/r/20180305200730.15812-24-adobriyan@gmail.com Signed-off-by: Alexey Dobriyan Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 21e9de8f8d3a..f48d942a487e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -466,7 +466,7 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) set_bit(slab_index(p, s, addr), map); } -static inline int size_from_object(struct kmem_cache *s) +static inline unsigned int size_from_object(struct kmem_cache *s) { if (s->flags & SLAB_RED_ZONE) return s->size - s->red_left_pad; -- cgit v1.2.3 From 302d55d51d04ab92f8b3f6a89d3424299e3efdf7 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 5 Apr 2018 16:21:46 -0700 Subject: slab: use 32-bit arithmetic in freelist_randomize() SLAB doesn't support 4GB+ of objects per slab, therefore randomization doesn't need size_t. 
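The loop retyped in the diff below is, in the full function, an identity fill followed by a Fisher-Yates shuffle over at most one slab's worth of indexes, which is why 32-bit counters are plenty. A self-contained userspace sketch of that shape, with rand() standing in for the kernel's seeded prandom state:

    #include <stdlib.h>

    static void freelist_shuffle(unsigned int *list, unsigned int count)
    {
        unsigned int i;

        for (i = 0; i < count; i++)
            list[i] = i;                /* identity freelist */

        if (count < 2)
            return;                     /* nothing to shuffle */

        for (i = count - 1; i > 0; i--) {   /* Fisher-Yates pass */
            unsigned int j = rand() % (i + 1);
            unsigned int tmp = list[i];

            list[i] = list[j];
            list[j] = tmp;
        }
    }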
Link: http://lkml.kernel.org/r/20180305200730.15812-25-adobriyan@gmail.com Signed-off-by: Alexey Dobriyan Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 01224cb90080..e2e2485b3496 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1186,10 +1186,10 @@ EXPORT_SYMBOL(kmalloc_order_trace); #ifdef CONFIG_SLAB_FREELIST_RANDOM /* Randomize a generic freelist */ static void freelist_randomize(struct rnd_state *state, unsigned int *list, - size_t count) + unsigned int count) { - size_t i; unsigned int rand; + unsigned int i; for (i = 0; i < count; i++) list[i] = i; -- cgit v1.2.3 From 613a5eb5677923fdaecfa582738c7bcf80abe186 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 5 Apr 2018 16:21:50 -0700 Subject: slab, slub: remove size disparity on debug kernel I have noticed on debug kernel with SLAB, the size of some non-root slabs were larger than their corresponding root slabs. e.g. for radix_tree_node: $cat /proc/slabinfo | grep radix name ... radix_tree_node 15052 15075 4096 1 1 ... $cat /cgroup/memory/temp/memory.kmem.slabinfo | grep radix name ... radix_tree_node 1581 158 4120 1 2 ... However for SLUB in debug kernel, the sizes were same. On further inspection it is found that SLUB always use kmem_cache.object_size to measure the kmem_cache.size while SLAB use the given kmem_cache.size. In the debug kernel the slab's size can be larger than its object_size. Thus in the creation of non-root slab, the SLAB uses the root's size as base to calculate the non-root slab's size and thus non-root slab's size can be larger than the root slab's size. For SLUB, the non-root slab's size is measured based on the root's object_size and thus the size will remain same for root and non-root slab. This patch makes slab's object_size the default base to measure the slab's size. 
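Restating the mechanism with the slabinfo numbers quoted above (debug SLAB; padding amounts are internal to debug SLAB, figures illustrative):

    /* root radix_tree_node:   size 4096
     * memcg radix_tree_node:  size 4120 (before this patch)
     *
     * Old: create_cache() for the child started from root->size, an
     *      already debug-padded value, so padding was layered twice.
     * New: the child starts from root->object_size, runs the same
     *      sizing calculation as the root, and comes out equal.
     */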
Link: http://lkml.kernel.org/r/20180313165428.58699-1-shakeelb@google.com Fixes: 794b1248be4e ("memcg, slab: separate memcg vs root cache creation paths") Signed-off-by: Shakeel Butt Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index e2e2485b3496..4a4bd71ab728 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -379,7 +379,7 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, } static struct kmem_cache *create_cache(const char *name, - unsigned int object_size, unsigned int size, unsigned int align, + unsigned int object_size, unsigned int align, slab_flags_t flags, unsigned int useroffset, unsigned int usersize, void (*ctor)(void *), struct mem_cgroup *memcg, struct kmem_cache *root_cache) @@ -396,8 +396,7 @@ static struct kmem_cache *create_cache(const char *name, goto out; s->name = name; - s->object_size = object_size; - s->size = size; + s->size = s->object_size = object_size; s->align = align; s->ctor = ctor; s->useroffset = useroffset; @@ -503,7 +502,7 @@ kmem_cache_create_usercopy(const char *name, goto out_unlock; } - s = create_cache(cache_name, size, size, + s = create_cache(cache_name, size, calculate_alignment(flags, align, size), flags, useroffset, usersize, ctor, NULL, NULL); if (IS_ERR(s)) { @@ -650,7 +649,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, goto out_unlock; s = create_cache(cache_name, root_cache->object_size, - root_cache->size, root_cache->align, + root_cache->align, root_cache->flags & CACHE_CREATE_MASK, root_cache->useroffset, root_cache->usersize, root_cache->ctor, memcg, root_cache); -- cgit v1.2.3 From 1ba586de22909f48db78682ee791e0213aba73ae Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 5 Apr 2018 16:21:53 -0700 Subject: mm/slab_common.c: remove test if cache name is accessible Since commit db265eca7700 ("mm/sl[aou]b: Move duping of slab name to slab_common.c"), the kernel always duplicates the slab cache name when creating a slab cache, so the test if the slab name is accessible is useless. Link: http://lkml.kernel.org/r/alpine.LRH.2.02.1803231133310.22626@file01.intranet.prod.int.rdu2.redhat.com Signed-off-by: Mikulas Patocka Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 4a4bd71ab728..2e682a4c8877 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -84,31 +84,12 @@ EXPORT_SYMBOL(kmem_cache_size); #ifdef CONFIG_DEBUG_VM static int kmem_cache_sanity_check(const char *name, unsigned int size) { - struct kmem_cache *s = NULL; - if (!name || in_interrupt() || size < sizeof(void *) || size > KMALLOC_MAX_SIZE) { pr_err("kmem_cache_create(%s) integrity check failed\n", name); return -EINVAL; } - list_for_each_entry(s, &slab_caches, list) { - char tmp; - int res; - - /* - * This happens when the module gets unloaded and doesn't - * destroy its slab cache and no-one else reuses the vmalloc - * area of the module. Print a warning. 
- */ - res = probe_kernel_address(s->name, tmp); - if (res) { - pr_err("Slab cache with size %u has lost its name\n", - s->object_size); - continue; - } - } - WARN_ON(strchr(name, ' ')); /* It confuses parsers */ return 0; } -- cgit v1.2.3 From f9e13c0a5a33d1eaec374d6d4dab53a4f72756a0 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 5 Apr 2018 16:21:57 -0700 Subject: slab, slub: skip unnecessary kasan_cache_shutdown() The kasan quarantine is designed to delay freeing slab objects to catch use-after-free. The quarantine can be large (several percent of machine memory size). When kmem_caches are deleted related objects are flushed from the quarantine but this requires scanning the entire quarantine which can be very slow. We have seen the kernel busily working on this while holding slab_mutex and badly affecting cache_reaper, slabinfo readers and memcg kmem cache creations. It can easily reproduced by following script: yes . | head -1000000 | xargs stat > /dev/null for i in `seq 1 10`; do seq 500 | (cd /cg/memory && xargs mkdir) seq 500 | xargs -I{} sh -c 'echo $BASHPID > \ /cg/memory/{}/tasks && exec stat .' > /dev/null seq 500 | (cd /cg/memory && xargs rmdir) done The busy stack: kasan_cache_shutdown shutdown_cache memcg_destroy_kmem_caches mem_cgroup_css_free css_free_rwork_fn process_one_work worker_thread kthread ret_from_fork This patch is based on the observation that if the kmem_cache to be destroyed is empty then there should not be any objects of this cache in the quarantine. Without the patch the script got stuck for couple of hours. With the patch the script completed within a second. Link: http://lkml.kernel.org/r/20180327230603.54721-1-shakeelb@google.com Signed-off-by: Shakeel Butt Reviewed-by: Andrew Morton Acked-by: Andrey Ryabinin Acked-by: Christoph Lameter Cc: Vladimir Davydov Cc: Alexander Potapenko Cc: Greg Thelen Cc: Dmitry Vyukov Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.c | 3 ++- mm/slab.c | 12 ++++++++++++ mm/slab.h | 1 + mm/slub.c | 11 +++++++++++ 4 files changed, 26 insertions(+), 1 deletion(-) diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index f7a5e1d1ba87..bc0e68f7dc75 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -382,7 +382,8 @@ void kasan_cache_shrink(struct kmem_cache *cache) void kasan_cache_shutdown(struct kmem_cache *cache) { - quarantine_remove_cache(cache); + if (!__kmem_cache_empty(cache)) + quarantine_remove_cache(cache); } size_t kasan_metadata_size(struct kmem_cache *cache) diff --git a/mm/slab.c b/mm/slab.c index fb106e8277b7..e3a9b8e23306 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2291,6 +2291,18 @@ out: return nr_freed; } +bool __kmem_cache_empty(struct kmem_cache *s) +{ + int node; + struct kmem_cache_node *n; + + for_each_kmem_cache_node(s, node, n) + if (!list_empty(&n->slabs_full) || + !list_empty(&n->slabs_partial)) + return false; + return true; +} + int __kmem_cache_shrink(struct kmem_cache *cachep) { int ret = 0; diff --git a/mm/slab.h b/mm/slab.h index e8981e811c45..68bdf498da3b 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -166,6 +166,7 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size, SLAB_TEMPORARY | \ SLAB_ACCOUNT) +bool __kmem_cache_empty(struct kmem_cache *); int __kmem_cache_shutdown(struct kmem_cache *); void __kmem_cache_release(struct kmem_cache *); int __kmem_cache_shrink(struct kmem_cache *); diff --git a/mm/slub.c b/mm/slub.c index f48d942a487e..4fb037c98782 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ 
-3696,6 +3696,17 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) discard_slab(s, page); } +bool __kmem_cache_empty(struct kmem_cache *s) +{ + int node; + struct kmem_cache_node *n; + + for_each_kmem_cache_node(s, node, n) + if (n->nr_partial || slabs_node(s, node)) + return false; + return true; +} + /* * Release all resources used by a slab cache. */ -- cgit v1.2.3 From c01f0b54efb1519f513089984c163ad3f6fea6eb Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 5 Apr 2018 16:22:01 -0700 Subject: mm/ksm.c: make stable_node_dup() static stable_node_dup() is local to the source and does not need to be in global scope, so make it static. Cleans up sparse warning: mm/ksm.c:1321:13: warning: symbol 'stable_node_dup' was not declared. Should it be static? Link: http://lkml.kernel.org/r/20180206221005.12642-1-colin.king@canonical.com Signed-off-by: Colin Ian King Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index adb5f991da8e..7596e14bc233 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1318,10 +1318,10 @@ bool is_page_sharing_candidate(struct stable_node *stable_node) return __is_page_sharing_candidate(stable_node, 0); } -struct page *stable_node_dup(struct stable_node **_stable_node_dup, - struct stable_node **_stable_node, - struct rb_root *root, - bool prune_stale_stable_nodes) +static struct page *stable_node_dup(struct stable_node **_stable_node_dup, + struct stable_node **_stable_node, + struct rb_root *root, + bool prune_stale_stable_nodes) { struct stable_node *dup, *found = NULL, *stable_node = *_stable_node; struct hlist_node *hlist_safe; -- cgit v1.2.3 From 57a7702b12bc610393bf7764d25183392344ee92 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 5 Apr 2018 16:22:05 -0700 Subject: mm: always print RLIMIT_DATA warning The documentation for ignore_rlimit_data says that it will print a warning at first misuse. Yet it doesn't seem to do that. Fix the code to print the warning even when we allow the process to continue. Link: http://lkml.kernel.org/r/1517935505-9321-1-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Acked-by: Konstantin Khlebnikov Cc: Cyrill Gorcunov Cc: Vladimir Davydov Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index aa0dc8231c0d..f2154fc2548b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3191,13 +3191,15 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) if (rlimit(RLIMIT_DATA) == 0 && mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT) return true; - if (!ignore_rlimit_data) { - pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits or use boot option ignore_rlimit_data.\n", - current->comm, current->pid, - (mm->data_vm + npages) << PAGE_SHIFT, - rlimit(RLIMIT_DATA)); + + pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits%s.\n", + current->comm, current->pid, + (mm->data_vm + npages) << PAGE_SHIFT, + rlimit(RLIMIT_DATA), + ignore_rlimit_data ? 
"" : " or use boot option ignore_rlimit_data"); + + if (!ignore_rlimit_data) return false; - } } return true; -- cgit v1.2.3 From 310253514bbf179c5f82e20a7a4bbf07abc7f5ad Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 5 Apr 2018 16:22:08 -0700 Subject: mm/migrate: rename migration reason MR_CMA to MR_CONTIG_RANGE alloc_contig_range() initiates compaction and eventual migration for the purpose of either CMA or HugeTLB allocations. At present, the reason code remains the same MR_CMA for either of these cases. Let's make it MR_CONTIG_RANGE which will appropriately reflect the reason code in both these cases. Link: http://lkml.kernel.org/r/20180202091518.18798-1-khandual@linux.vnet.ibm.com Signed-off-by: Anshuman Khandual Acked-by: Michal Hocko Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/mmu_context_iommu.c | 2 +- include/linux/migrate.h | 2 +- include/trace/events/migrate.h | 2 +- mm/page_alloc.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c index e0a2d8e806ed..9a8a084e4aba 100644 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ b/arch/powerpc/mm/mmu_context_iommu.c @@ -112,7 +112,7 @@ static int mm_iommu_move_page_from_cma(struct page *page) put_page(page); /* Drop the gup reference */ ret = migrate_pages(&cma_migrate_pages, new_iommu_non_cma_page, - NULL, 0, MIGRATE_SYNC, MR_CMA); + NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE); if (ret) { if (!list_empty(&cma_migrate_pages)) putback_movable_pages(&cma_migrate_pages); diff --git a/include/linux/migrate.h b/include/linux/migrate.h index a2246cf670ba..ab45f8a0d288 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -25,7 +25,7 @@ enum migrate_reason { MR_SYSCALL, /* also applies to cpusets */ MR_MEMPOLICY_MBIND, MR_NUMA_MISPLACED, - MR_CMA, + MR_CONTIG_RANGE, MR_TYPES }; diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index bcf4daccd6be..711372845945 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h @@ -20,7 +20,7 @@ EM( MR_SYSCALL, "syscall_or_cpuset") \ EM( MR_MEMPOLICY_MBIND, "mempolicy_mbind") \ EM( MR_NUMA_MISPLACED, "numa_misplaced") \ - EMe(MR_CMA, "cma") + EMe(MR_CONTIG_RANGE, "contig_range") /* * First define the enums in the above macros to be exported to userspace diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4ea018263210..531d6acb0106 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7591,7 +7591,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, cc->nr_migratepages -= nr_reclaimed; ret = migrate_pages(&cc->migratepages, alloc_migrate_target, - NULL, 0, cc->mode, MR_CMA); + NULL, 0, cc->mode, MR_CONTIG_RANGE); } if (ret < 0) { putback_movable_pages(&cc->migratepages); -- cgit v1.2.3 From 8e7a0c9100cade3bdbf851206b892b6f98eb39c9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 5 Apr 2018 16:22:23 -0700 Subject: mm/swap_slots.c: use conditional compilation For mm/swap_slots.c, use the traditional Linux method of conditional compilation and linking instead of always compiling it by using #ifdef CONFIG_SWAP and #endif for the entire source file (excluding header files). 
Link: http://lkml.kernel.org/r/c2a47015-0b5a-d0d9-8bc7-9984c049df20@infradead.org Signed-off-by: Randy Dunlap Acked-by: Tim Chen Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Makefile | 4 ++-- mm/swap_slots.c | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/mm/Makefile b/mm/Makefile index e669f02c5a54..b4e54a9ae9c5 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -37,7 +37,7 @@ obj-y := filemap.o mempool.o oom_kill.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ - compaction.o vmacache.o swap_slots.o \ + compaction.o vmacache.o \ interval_tree.o list_lru.o workingset.o \ debug.o $(mmu-y) @@ -55,7 +55,7 @@ ifdef CONFIG_MMU endif obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o -obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o +obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o swap_slots.o obj-$(CONFIG_FRONTSWAP) += frontswap.o obj-$(CONFIG_ZSWAP) += zswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o diff --git a/mm/swap_slots.c b/mm/swap_slots.c index bebc19292018..f2641894f440 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -34,8 +34,6 @@ #include #include -#ifdef CONFIG_SWAP - static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots); static bool swap_slot_cache_active; bool swap_slot_cache_enabled; @@ -356,5 +354,3 @@ repeat: return entry; } - -#endif /* CONFIG_SWAP */ -- cgit v1.2.3 From 3a2d7fa8a3d5ae740bd0c21d933acc6220857ed0 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 5 Apr 2018 16:22:27 -0700 Subject: mm: disable interrupts while initializing deferred pages Vlastimil Babka reported about a window issue during which when deferred pages are initialized, and the current version of on-demand initialization is finished, allocations may fail. While this is highly unlikely scenario, since this kind of allocation request must be large, and must come from interrupt handler, we still want to cover it. We solve this by initializing deferred pages with interrupts disabled, and holding node_size_lock spin lock while pages in the node are being initialized. The on-demand deferred page initialization that comes later will use the same lock, and thus synchronize with deferred_init_memmap(). It is unlikely for threads that initialize deferred pages to be interrupted. They run soon after smp_init(), but before modules are initialized, and long before user space programs. This is why there is no adverse effect of having these threads running with interrupts disabled. 
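The synchronization the patch leans on is visible in the helpers being moved below: pgdat_resize_lock()/pgdat_resize_unlock() are thin wrappers around spin_lock_irqsave()/spin_unlock_irqrestore() on the node's node_size_lock. The usage shape, sketched (kernel context assumed):

    unsigned long flags;

    pgdat_resize_lock(pgdat, &flags);   /* IRQs off, lock held */
    /* ... initialize this node's deferred struct pages ... */
    pgdat_resize_unlock(pgdat, &flags); /* unlock, restore IRQ state */

Since deferred_init_memmap() and the later on-demand path both take this lock with interrupts disabled, an allocation from interrupt context can never observe a half-initialized range.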
[pasha.tatashin@oracle.com: v6] Link: http://lkml.kernel.org/r/20180313182355.17669-2-pasha.tatashin@oracle.com Link: http://lkml.kernel.org/r/20180309220807.24961-2-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Andrew Morton Cc: Steven Sistare Cc: Daniel Jordan Cc: Masayoshi Mizuma Cc: Michal Hocko Cc: Catalin Marinas Cc: AKASHI Takahiro Cc: Gioh Kim Cc: Heiko Carstens Cc: Yaowei Bai Cc: Wei Yang Cc: Paul Burton Cc: Miles Chen Cc: Vlastimil Babka Cc: Mel Gorman Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 53 ++++++++++++++++++++++-------------------- include/linux/mmzone.h | 5 ++-- mm/page_alloc.c | 19 ++++++++------- 3 files changed, 42 insertions(+), 35 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index aba5f86eb038..2b0265265c28 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -51,24 +51,6 @@ enum { MMOP_ONLINE_MOVABLE, }; -/* - * pgdat resizing functions - */ -static inline -void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags) -{ - spin_lock_irqsave(&pgdat->node_size_lock, *flags); -} -static inline -void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags) -{ - spin_unlock_irqrestore(&pgdat->node_size_lock, *flags); -} -static inline -void pgdat_resize_init(struct pglist_data *pgdat) -{ - spin_lock_init(&pgdat->node_size_lock); -} /* * Zone resizing functions * @@ -246,13 +228,6 @@ extern void clear_zone_contiguous(struct zone *zone); ___page; \ }) -/* - * Stub functions for when hotplug is off - */ -static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {} -static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {} -static inline void pgdat_resize_init(struct pglist_data *pgdat) {} - static inline unsigned zone_span_seqbegin(struct zone *zone) { return 0; @@ -293,6 +268,34 @@ static inline bool movable_node_is_enabled(void) } #endif /* ! 
CONFIG_MEMORY_HOTPLUG */ +#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT) +/* + * pgdat resizing functions + */ +static inline +void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags) +{ + spin_lock_irqsave(&pgdat->node_size_lock, *flags); +} +static inline +void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags) +{ + spin_unlock_irqrestore(&pgdat->node_size_lock, *flags); +} +static inline +void pgdat_resize_init(struct pglist_data *pgdat) +{ + spin_lock_init(&pgdat->node_size_lock); +} +#else /* !(CONFIG_MEMORY_HOTPLUG || CONFIG_DEFERRED_STRUCT_PAGE_INIT) */ +/* + * Stub functions for when hotplug is off + */ +static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {} +static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {} +static inline void pgdat_resize_init(struct pglist_data *pgdat) {} +#endif /* !(CONFIG_MEMORY_HOTPLUG || CONFIG_DEFERRED_STRUCT_PAGE_INIT) */ + #ifdef CONFIG_MEMORY_HOTREMOVE extern bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a2db4576e499..5d935411d3c4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -633,14 +633,15 @@ typedef struct pglist_data { #ifndef CONFIG_NO_BOOTMEM struct bootmem_data *bdata; #endif -#ifdef CONFIG_MEMORY_HOTPLUG +#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT) /* * Must be held any time you expect node_start_pfn, node_present_pages * or node_spanned_pages stay constant. Holding this will also * guarantee that any pfn_valid() stays that way. * * pgdat_resize_lock() and pgdat_resize_unlock() are provided to - * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG. + * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG + * or CONFIG_DEFERRED_STRUCT_PAGE_INIT. 
* * Nests above zone->lock and zone->span_seqlock */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 531d6acb0106..cf5555df78bd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1506,7 +1506,7 @@ static void __init deferred_free_pages(int nid, int zid, unsigned long pfn, } else if (!(pfn & nr_pgmask)) { deferred_free_range(pfn - nr_free, nr_free); nr_free = 1; - cond_resched(); + touch_nmi_watchdog(); } else { nr_free++; } @@ -1535,7 +1535,7 @@ static unsigned long __init deferred_init_pages(int nid, int zid, continue; } else if (!page || !(pfn & nr_pgmask)) { page = pfn_to_page(pfn); - cond_resched(); + touch_nmi_watchdog(); } else { page++; } @@ -1552,23 +1552,25 @@ static int __init deferred_init_memmap(void *data) int nid = pgdat->node_id; unsigned long start = jiffies; unsigned long nr_pages = 0; - unsigned long spfn, epfn; + unsigned long spfn, epfn, first_init_pfn, flags; phys_addr_t spa, epa; int zid; struct zone *zone; - unsigned long first_init_pfn = pgdat->first_deferred_pfn; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); u64 i; + /* Bind memory initialisation thread to a local node if possible */ + if (!cpumask_empty(cpumask)) + set_cpus_allowed_ptr(current, cpumask); + + pgdat_resize_lock(pgdat, &flags); + first_init_pfn = pgdat->first_deferred_pfn; if (first_init_pfn == ULONG_MAX) { + pgdat_resize_unlock(pgdat, &flags); pgdat_init_report_one_done(); return 0; } - /* Bind memory initialisation thread to a local node if possible */ - if (!cpumask_empty(cpumask)) - set_cpus_allowed_ptr(current, cpumask); - /* Sanity check boundaries */ BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn); BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat)); @@ -1598,6 +1600,7 @@ static int __init deferred_init_memmap(void *data) epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); deferred_free_pages(nid, zid, spfn, epfn); } + pgdat_resize_unlock(pgdat, &flags); /* Sanity check that the next zone really is unpopulated */ WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); -- cgit v1.2.3 From c9e97a1997fbf3a1d18d4065c2ca381f0704d7e5 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 5 Apr 2018 16:22:31 -0700 Subject: mm: initialize pages on demand during boot Deferred page initialization allows the boot cpu to initialize a small subset of the system's pages early in boot, with other cpus doing the rest later on. It is, however, problematic to know how many pages the kernel needs during boot. Different modules and kernel parameters may change the requirement, so the boot cpu either initializes too many pages or runs out of memory. To fix that, initialize early pages on demand. This ensures the kernel does the minimum amount of work to initialize pages during boot and leaves the rest to be divided in the multithreaded initialization path (deferred_init_memmap). The on-demand code is permanently disabled using static branching once deferred pages are initialized. After the static branch is changed to false, the overhead is up to two branch-always instructions if the zone watermark check fails or if rmqueue fails. Sergey Senozhatsky noticed that while deferred pages currently make sense only on NUMA machines (we start one thread per latency node), CONFIG_NUMA is not a requirement for CONFIG_DEFERRED_STRUCT_PAGE_INIT, so that must also be addressed in this patch.
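The static-branch mechanics referred to above reduce to the following sketch (names match the diff below; error paths elided):

	/* defined true: on-demand growth is possible during early boot */
	static DEFINE_STATIC_KEY_TRUE(deferred_pages);

	/* allocator slow path: nearly free once the key is disabled */
	if (static_branch_unlikely(&deferred_pages)) {
		if (_deferred_grow_zone(zone, order))
			goto try_this_zone;
	}

	/* page_alloc_init_late(): all deferred pages are now initialized */
	static_branch_disable(&deferred_pages);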
[akpm@linux-foundation.org: fix typo in comment, make deferred_pages static] [pasha.tatashin@oracle.com: fix min() type mismatch warning] Link: http://lkml.kernel.org/r/20180212164543.26592-1-pasha.tatashin@oracle.com [pasha.tatashin@oracle.com: use zone_to_nid() in deferred_grow_zone()] Link: http://lkml.kernel.org/r/20180214163343.21234-2-pasha.tatashin@oracle.com [pasha.tatashin@oracle.com: might_sleep warning] Link: http://lkml.kernel.org/r/20180306192022.28289-1-pasha.tatashin@oracle.com [akpm@linux-foundation.org: s/spin_lock/spin_lock_irq/ in page_alloc_init_late()] [pasha.tatashin@oracle.com: v5] Link: http://lkml.kernel.org/r/20180309220807.24961-3-pasha.tatashin@oracle.com [akpm@linux-foundation.org: tweak comments] [pasha.tatashin@oracle.com: v6] Link: http://lkml.kernel.org/r/20180313182355.17669-3-pasha.tatashin@oracle.com [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20180209192216.20509-2-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Daniel Jordan Reviewed-by: Steven Sistare Reviewed-by: Andrew Morton Tested-by: Masayoshi Mizuma Acked-by: Mel Gorman Cc: Michal Hocko Cc: Catalin Marinas Cc: AKASHI Takahiro Cc: Gioh Kim Cc: Heiko Carstens Cc: Yaowei Bai Cc: Wei Yang Cc: Paul Burton Cc: Miles Chen Cc: Vlastimil Babka Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 10 --- mm/memblock.c | 23 ------ mm/page_alloc.c | 183 +++++++++++++++++++++++++++++++++++++---------- 3 files changed, 144 insertions(+), 72 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index f92ea7783652..0257aee7ab4b 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -416,21 +416,11 @@ static inline void early_memtest(phys_addr_t start, phys_addr_t end) { } #endif - -extern unsigned long memblock_reserved_memory_within(phys_addr_t start_addr, - phys_addr_t end_addr); #else static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align) { return 0; } - -static inline unsigned long memblock_reserved_memory_within(phys_addr_t start_addr, - phys_addr_t end_addr) -{ - return 0; -} - #endif /* CONFIG_HAVE_MEMBLOCK */ #endif /* __KERNEL__ */ diff --git a/mm/memblock.c b/mm/memblock.c index 48376bd33274..f7f76513a0c5 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1750,29 +1750,6 @@ static void __init_memblock memblock_dump(struct memblock_type *type) } } -extern unsigned long __init_memblock -memblock_reserved_memory_within(phys_addr_t start_addr, phys_addr_t end_addr) -{ - struct memblock_region *rgn; - unsigned long size = 0; - int idx; - - for_each_memblock_type(idx, (&memblock.reserved), rgn) { - phys_addr_t start, end; - - if (rgn->base + rgn->size < start_addr) - continue; - if (rgn->base > end_addr) - continue; - - start = rgn->base; - end = start + rgn->size; - size += end - start; - } - - return size; -} - void __init_memblock __memblock_dump_all(void) { pr_info("MEMBLOCK configuration:\n"); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cf5555df78bd..3183eb2f579c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -292,40 +292,6 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT - -/* - * Determine how many pages need to be initialized during early boot - * (non-deferred initialization). - * The value of first_deferred_pfn will be set later, once non-deferred pages - * are initialized, but for now set it ULONG_MAX. 
- */ -static inline void reset_deferred_meminit(pg_data_t *pgdat) -{ - phys_addr_t start_addr, end_addr; - unsigned long max_pgcnt; - unsigned long reserved; - - /* - * Initialise at least 2G of a node but also take into account that - * two large system hashes that can take up 1GB for 0.25TB/node. - */ - max_pgcnt = max(2UL << (30 - PAGE_SHIFT), - (pgdat->node_spanned_pages >> 8)); - - /* - * Compensate the all the memblock reservations (e.g. crash kernel) - * from the initial estimation to make sure we will initialize enough - * memory to boot. - */ - start_addr = PFN_PHYS(pgdat->node_start_pfn); - end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt); - reserved = memblock_reserved_memory_within(start_addr, end_addr); - max_pgcnt += PHYS_PFN(reserved); - - pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages); - pgdat->first_deferred_pfn = ULONG_MAX; -} - /* Returns true if the struct page for the pfn is uninitialised */ static inline bool __meminit early_page_uninitialised(unsigned long pfn) { @@ -361,10 +327,6 @@ static inline bool update_defer_init(pg_data_t *pgdat, return true; } #else -static inline void reset_deferred_meminit(pg_data_t *pgdat) -{ -} - static inline bool early_page_uninitialised(unsigned long pfn) { return false; @@ -1611,6 +1573,117 @@ static int __init deferred_init_memmap(void *data) pgdat_init_report_one_done(); return 0; } + +/* + * During boot we initialize deferred pages on-demand, as needed, but once + * page_alloc_init_late() has finished, the deferred pages are all initialized, + * and we can permanently disable that path. + */ +static DEFINE_STATIC_KEY_TRUE(deferred_pages); + +/* + * If this zone has deferred pages, try to grow it by initializing enough + * deferred pages to satisfy the allocation specified by order, rounded up to + * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments + * of SECTION_SIZE bytes by initializing struct pages in increments of + * PAGES_PER_SECTION * sizeof(struct page) bytes. + * + * Return true when zone was grown, otherwise return false. We return true even + * when we grow less than requested, to let the caller decide if there are + * enough pages to satisfy the allocation. + * + * Note: We use noinline because this function is needed only during boot, and + * it is called from a __ref function _deferred_grow_zone. This way we are + * making sure that it is not inlined into permanent text section. + */ +static noinline bool __init +deferred_grow_zone(struct zone *zone, unsigned int order) +{ + int zid = zone_idx(zone); + int nid = zone_to_nid(zone); + pg_data_t *pgdat = NODE_DATA(nid); + unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION); + unsigned long nr_pages = 0; + unsigned long first_init_pfn, spfn, epfn, t, flags; + unsigned long first_deferred_pfn = pgdat->first_deferred_pfn; + phys_addr_t spa, epa; + u64 i; + + /* Only the last zone may have deferred pages */ + if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat)) + return false; + + pgdat_resize_lock(pgdat, &flags); + + /* + * If deferred pages have been initialized while we were waiting for + * the lock, return true, as the zone was grown. The caller will retry + * this zone. We won't return to this function since the caller also + * has this static branch. + */ + if (!static_branch_unlikely(&deferred_pages)) { + pgdat_resize_unlock(pgdat, &flags); + return true; + } + + /* + * If someone grew this zone while we were waiting for spinlock, return + * true, as there might be enough pages already. 
+ */ + if (first_deferred_pfn != pgdat->first_deferred_pfn) { + pgdat_resize_unlock(pgdat, &flags); + return true; + } + + first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn); + + if (first_init_pfn >= pgdat_end_pfn(pgdat)) { + pgdat_resize_unlock(pgdat, &flags); + return false; + } + + for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { + spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); + epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); + + while (spfn < epfn && nr_pages < nr_pages_needed) { + t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION); + first_deferred_pfn = min(t, epfn); + nr_pages += deferred_init_pages(nid, zid, spfn, + first_deferred_pfn); + spfn = first_deferred_pfn; + } + + if (nr_pages >= nr_pages_needed) + break; + } + + for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { + spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); + epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa)); + deferred_free_pages(nid, zid, spfn, epfn); + + if (first_deferred_pfn == epfn) + break; + } + pgdat->first_deferred_pfn = first_deferred_pfn; + pgdat_resize_unlock(pgdat, &flags); + + return nr_pages > 0; +} + +/* + * deferred_grow_zone() is __init, but it is called from + * get_page_from_freelist() during early boot until deferred_pages permanently + * disables this call. This is why we have refdata wrapper to avoid warning, + * and to ensure that the function body gets unloaded. + */ +static bool __ref +_deferred_grow_zone(struct zone *zone, unsigned int order) +{ + return deferred_grow_zone(zone, order); +} + #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ void __init page_alloc_init_late(void) @@ -1629,6 +1702,12 @@ void __init page_alloc_init_late(void) /* Block until all are initialised */ wait_for_completion(&pgdat_init_all_done_comp); + /* + * We initialized the rest of the deferred pages. Permanently disable + * on-demand struct page initialization. + */ + static_branch_disable(&deferred_pages); + /* Reinit limits that are based on free pages after the kernel is up */ files_maxfiles_init(); #endif @@ -3208,6 +3287,16 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, ac_classzone_idx(ac), alloc_flags)) { int ret; +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + /* + * Watermark failed for this zone, but see if we can + * grow this zone if it contains deferred pages. + */ + if (static_branch_unlikely(&deferred_pages)) { + if (_deferred_grow_zone(zone, order)) + goto try_this_zone; + } +#endif /* Checked here to keep the fast path fast */ BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (alloc_flags & ALLOC_NO_WATERMARKS) @@ -3249,6 +3338,14 @@ try_this_zone: reserve_highatomic_pageblock(page, zone, order); return page; + } else { +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + /* Try again if zone has deferred pages */ + if (static_branch_unlikely(&deferred_pages)) { + if (_deferred_grow_zone(zone, order)) + goto try_this_zone; + } +#endif } } @@ -6244,7 +6341,15 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, alloc_node_mem_map(pgdat); - reset_deferred_meminit(pgdat); +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + /* + * We start only with one section of pages, more pages are added as + * needed until the rest of deferred pages are initialized. 
+ */ + pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION, + pgdat->node_spanned_pages); + pgdat->first_deferred_pfn = ULONG_MAX; +#endif free_area_init_core(pgdat); } -- cgit v1.2.3 From f0849ac0b8e072073ec5fcc7fadd05a77434364e Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 5 Apr 2018 16:22:35 -0700 Subject: mm: thp: fix potential clearing to referenced flag in page_idle_clear_pte_refs_one() For a PTE-mapped THP, the compound THP has not been split into normal 4K pages yet, so the whole THP is considered referenced if any one of its sub-pages is referenced. When walking a PTE-mapped THP with pvmw, all relevant PTEs will be checked to retrieve the referenced bit. But the current code just returns the result of the last PTE. If the last PTE has not been referenced, the referenced flag will be cleared. Just set referenced when ptep{pmdp}_clear_young_notify() returns true. Link: http://lkml.kernel.org/r/1518212451-87134-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Reported-by: Gang Deng Suggested-by: Kirill A. Shutemov Reviewed-by: Andrew Morton Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_idle.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/mm/page_idle.c b/mm/page_idle.c index 0a49374e6931..e412a63b2b74 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -65,11 +65,15 @@ static bool page_idle_clear_pte_refs_one(struct page *page, while (page_vma_mapped_walk(&pvmw)) { addr = pvmw.address; if (pvmw.pte) { - referenced = ptep_clear_young_notify(vma, addr, - pvmw.pte); + /* + * For PTE-mapped THP, one sub page is referenced, + * the whole THP is referenced. + */ + if (ptep_clear_young_notify(vma, addr, pvmw.pte)) + referenced = true; } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - referenced = pmdp_clear_young_notify(vma, addr, - pvmw.pmd); + if (pmdp_clear_young_notify(vma, addr, pvmw.pmd)) + referenced = true; } else { /* unexpected pmd-mapped page? */ WARN_ON_ONCE(1); -- cgit v1.2.3 From ba325585230ed17079fd5b4065c359ebba117bbe Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 5 Apr 2018 16:22:39 -0700 Subject: mm/memory_hotplug: enforce block size aligned range check Patch series "optimize memory hotplug", v3. This patchset: - Improves hotplug performance by eliminating a number of struct page traverses during memory hotplug. - Fixes some issues with hotplugging, where boundaries were not properly checked, and on x86 the block size was not properly aligned with the end of memory. - Also, potentially improves boot performance by eliminating a condition from __init_single_page(). - Adds robustness by verifying that struct pages are correctly poisoned when flags are accessed. The following experiments were performed on a Xeon(R) CPU E7-8895 v3 @ 2.60GHz with 1T of RAM. Booting in qemu with 960G of memory, time to initialize struct pages:

no-kvm:
         TRY1       TRY2
BEFORE:  39.433668  39.39705
AFTER:   36.903781  36.989329

with-kvm:
BEFORE:  10.977447  11.103164
AFTER:   10.929072  10.751885

Hotplug 896G memory:

no-kvm:
         TRY1        TRY2
BEFORE:  848.740000  846.910000
AFTER:   783.070000  786.560000

with-kvm:
         TRY1       TRY2
BEFORE:  34.410000  33.57
AFTER:   29.810000  29.580000

This patch (of 6): Start qemu with the following arguments: -m 64G,slots=2,maxmem=66G -object memory-backend-ram,id=mem1,size=2G Which boots the machine with 64G, and adds a device mem1 with 2G which can be hotplugged later.
Also make sure that the config has the following turned on: CONFIG_MEMORY_HOTPLUG CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE CONFIG_ACPI_HOTPLUG_MEMORY Using the qemu monitor, hotplug the memory: (qemu) device_add pc-dimm,id=dimm1,memdev=mem1 The operation will fail with the following trace: WARNING: CPU: 0 PID: 91 at drivers/base/memory.c:205 pages_correctly_reserved+0xe6/0x110 Modules linked in: CPU: 0 PID: 91 Comm: systemd-udevd Not tainted 4.16.0-rc1_pt_master #29 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.0-0-g63451fca13-prebuilt.qemu-project.org 04/01/2014 RIP: 0010:pages_correctly_reserved+0xe6/0x110 Call Trace: memory_subsys_online+0x44/0xa0 device_online+0x51/0x80 store_mem_state+0x5e/0xe0 kernfs_fop_write+0xfa/0x170 __vfs_write+0x2e/0x150 vfs_write+0xa8/0x1a0 SyS_write+0x4d/0xb0 do_syscall_64+0x5d/0x110 entry_SYSCALL_64_after_hwframe+0x21/0x86 ---[ end trace 6203bc4f1a5d30e8 ]--- The problem is detected in: drivers/base/memory.c static bool pages_correctly_reserved(unsigned long start_pfn) 205 if (WARN_ON_ONCE(!pfn_valid(pfn))) This function loops through every section in the newly added memory block and verifies that the first pfn is valid, meaning the section exists, has a mapping (struct page array), and is online. The block size on x86 is usually 128M, but when the machine is booted with more than 64G of memory, the block size is changed to 2G: $ cat /sys/devices/system/memory/block_size_bytes 80000000 or $ dmesg | grep "block size" [ 0.086469] x86/mm: Memory block size: 2048MB During memory hotplug and hotremove we verify that the range is section size aligned, but we actually must verify that it is block size aligned, because that is the proper unit for hotplug operations. See: Documentation/memory-hotplug.txt So, when the start_pfn of newly added memory is not block size aligned, we can get a memory block that has only part of it with properly populated sections. In our case the start_pfn starts from the last_pfn (end of physical memory). $ dmesg | grep last_pfn [ 0.000000] e820: last_pfn = 0x1040000 max_arch_pfn = 0x400000000 0x1040000 == 65G, and so is not 2G aligned! The fix is to enforce that memory that is hotplugged and hotremoved is block size aligned. With this fix, running the above sequence yields the following result: (qemu) device_add pc-dimm,id=dimm1,memdev=mem1 Block size [0x80000000] unaligned hotplug range: start 0x1040000000, size 0x80000000 acpi PNP0C80:00: add_memory failed acpi PNP0C80:00: acpi_memory_enable_device() error acpi PNP0C80:00: Enumeration failure Link: http://lkml.kernel.org/r/20180213193159.14606-2-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Ingo Molnar Acked-by: Michal Hocko Cc: Baoquan He Cc: Bharata B Rao Cc: Daniel Jordan Cc: Dan Williams Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Kirill A.
Shutemov Cc: Mel Gorman Cc: Steven Sistare Cc: Thomas Gleixner Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b2bd52ff7605..565048f496f7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1083,15 +1083,16 @@ out: static int check_hotplug_memory_range(u64 start, u64 size) { - u64 start_pfn = PFN_DOWN(start); + unsigned long block_sz = memory_block_size_bytes(); + u64 block_nr_pages = block_sz >> PAGE_SHIFT; u64 nr_pages = size >> PAGE_SHIFT; + u64 start_pfn = PFN_DOWN(start); - /* Memory range must be aligned with section */ - if ((start_pfn & ~PAGE_SECTION_MASK) || - (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) { - pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n", - (unsigned long long)start, - (unsigned long long)size); + /* memory range must be block size aligned */ + if (!nr_pages || !IS_ALIGNED(start_pfn, block_nr_pages) || + !IS_ALIGNED(nr_pages, block_nr_pages)) { + pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx", + block_sz, start, size); return -EINVAL; } -- cgit v1.2.3 From 078eb6aa50dc50cd85f09a22226b7e238b3397ce Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 5 Apr 2018 16:22:43 -0700 Subject: x86/mm/memory_hotplug: determine block size based on the end of boot memory Memory sections are combined into "memory block" chunks. These chunks are the units upon which memory can be added and removed. On x86 the new memory may be added after the end of the boot memory, therefore, if the block size does not align with the end of boot memory, memory hotplugging/hotremoving can be broken. Currently, whenever the machine is booted with more than 64G the block size is unconditionally increased to 2G from the base 128M. This is done in order to reduce the number of memory device files in sysfs: /sys/devices/system/memory/memoryXXX We must use the largest allowed block size that aligns to the next address to be able to hotplug the next block of memory. So, when memory is larger than or equal to 64G, we check the end address and find the largest block size that is still a power of two but smaller than or equal to 2G. Before the fix: Run qemu with: -m 64G,slots=2,maxmem=66G -object memory-backend-ram,id=mem1,size=2G (qemu) device_add pc-dimm,id=dimm1,memdev=mem1 Block size [0x80000000] unaligned hotplug range: start 0x1040000000, size 0x80000000 acpi PNP0C80:00: add_memory failed acpi PNP0C80:00: acpi_memory_enable_device() error acpi PNP0C80:00: Enumeration failure With the fix, memory is added successfully as the block size is set to 1G, and therefore aligns with the start address 0x1040000000. [pasha.tatashin@oracle.com: v4] Link: http://lkml.kernel.org/r/20180215165920.8570-3-pasha.tatashin@oracle.com Link: http://lkml.kernel.org/r/20180213193159.14606-3-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Steven Sistare Cc: Daniel Jordan Cc: Mel Gorman Cc: Michal Hocko Cc: Greg Kroah-Hartman Cc: Vlastimil Babka Cc: Bharata B Rao Cc: Dan Williams Cc: Kirill A.
Shutemov Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init_64.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 45241de66785..dca9abf2b85c 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1328,14 +1328,39 @@ int kern_addr_valid(unsigned long addr) return pfn_valid(pte_pfn(*pte)); } +/* + * Block size is the minimum amount of memory which can be hotplugged or + * hotremoved. It must be power of two and must be equal or larger than + * MIN_MEMORY_BLOCK_SIZE. + */ +#define MAX_BLOCK_SIZE (2UL << 30) + +/* Amount of ram needed to start using large blocks */ +#define MEM_SIZE_FOR_LARGE_BLOCK (64UL << 30) + static unsigned long probe_memory_block_size(void) { - unsigned long bz = MIN_MEMORY_BLOCK_SIZE; + unsigned long boot_mem_end = max_pfn << PAGE_SHIFT; + unsigned long bz; - /* if system is UV or has 64GB of RAM or more, use large blocks */ - if (is_uv_system() || ((max_pfn << PAGE_SHIFT) >= (64UL << 30))) - bz = 2UL << 30; /* 2GB */ + /* If this is UV system, always set 2G block size */ + if (is_uv_system()) { + bz = MAX_BLOCK_SIZE; + goto done; + } + /* Use regular block if RAM is smaller than MEM_SIZE_FOR_LARGE_BLOCK */ + if (boot_mem_end < MEM_SIZE_FOR_LARGE_BLOCK) { + bz = MIN_MEMORY_BLOCK_SIZE; + goto done; + } + + /* Find the largest allowed block size that aligns to memory end */ + for (bz = MAX_BLOCK_SIZE; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) { + if (IS_ALIGNED(boot_mem_end, bz)) + break; + } +done: pr_info("x86/mm: Memory block size: %ldMB\n", bz >> 20); return bz; -- cgit v1.2.3 From f165b378bbdf6c8afd950060fc3cbc935bb890c6 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 5 Apr 2018 16:22:47 -0700 Subject: mm: uninitialized struct page poisoning sanity checking During boot we poison struct page memory in order to ensure that no one is accessing this memory until the struct pages are initialized in __init_single_page(). This patch adds more scrutiny to this checking by making sure that flags do not equal the poison pattern when they are accessed. The pattern is all ones. Since node id is also stored in struct page, and may be accessed quite early, we add this enforcement into page_to_nid() function as well. Note, this is applicable only when NODE_NOT_IN_PAGE_FLAGS=n [pasha.tatashin@oracle.com: v4] Link: http://lkml.kernel.org/r/20180215165920.8570-4-pasha.tatashin@oracle.com Link: http://lkml.kernel.org/r/20180213193159.14606-4-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Ingo Molnar Acked-by: Michal Hocko Cc: Baoquan He Cc: Bharata B Rao Cc: Daniel Jordan Cc: Dan Williams Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Kirill A. 
Shutemov Cc: Mel Gorman Cc: Steven Sistare Cc: Thomas Gleixner Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 +++- include/linux/page-flags.h | 22 +++++++++++++++++----- mm/memblock.c | 2 +- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index f945dff34925..2e40a44a1fae 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -903,7 +903,9 @@ extern int page_to_nid(const struct page *page); #else static inline int page_to_nid(const struct page *page) { - return (page->flags >> NODES_PGSHIFT) & NODES_MASK; + struct page *p = (struct page *)page; + + return (PF_POISONED_CHECK(p)->flags >> NODES_PGSHIFT) & NODES_MASK; } #endif diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 50c2b8786831..e34a27727b9a 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -156,9 +156,18 @@ static __always_inline int PageCompound(struct page *page) return test_bit(PG_head, &page->flags) || PageTail(page); } +#define PAGE_POISON_PATTERN -1l +static inline int PagePoisoned(const struct page *page) +{ + return page->flags == PAGE_POISON_PATTERN; +} + /* * Page flags policies wrt compound pages * + * PF_POISONED_CHECK + * check if this struct page poisoned/uninitialized + * * PF_ANY: * the page flag is relevant for small, head and tail pages. * @@ -176,17 +185,20 @@ static __always_inline int PageCompound(struct page *page) * PF_NO_COMPOUND: * the page flag is not relevant for compound pages. */ -#define PF_ANY(page, enforce) page -#define PF_HEAD(page, enforce) compound_head(page) +#define PF_POISONED_CHECK(page) ({ \ + VM_BUG_ON_PGFLAGS(PagePoisoned(page), page); \ + page; }) +#define PF_ANY(page, enforce) PF_POISONED_CHECK(page) +#define PF_HEAD(page, enforce) PF_POISONED_CHECK(compound_head(page)) #define PF_ONLY_HEAD(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(PageTail(page), page); \ - page;}) + PF_POISONED_CHECK(page); }) #define PF_NO_TAIL(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page); \ - compound_head(page);}) + PF_POISONED_CHECK(compound_head(page)); }) #define PF_NO_COMPOUND(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page); \ - page;}) + PF_POISONED_CHECK(page); }) /* * Macros to create function definitions for page flags diff --git a/mm/memblock.c b/mm/memblock.c index f7f76513a0c5..c2fed302c2f8 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1345,7 +1345,7 @@ void * __init memblock_virt_alloc_try_nid_raw( min_addr, max_addr, nid); #ifdef CONFIG_DEBUG_VM if (ptr && size > 0) - memset(ptr, 0xff, size); + memset(ptr, PAGE_POISON_PATTERN, size); #endif return ptr; } -- cgit v1.2.3 From b77eab7079d9e477489d2416cceda05d3c1cf21f Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 5 Apr 2018 16:22:52 -0700 Subject: mm/memory_hotplug: optimize probe routine When memory is hotplugged, pages_correctly_reserved() is called to verify that the added memory is present; this routine traverses every struct page and verifies that PageReserved() is set. This is a slow operation, especially if a large amount of memory is added. Instead of checking every page, it is enough to simply check that the section is present, has a mapping (the struct page array is allocated), and the mapping is online. In addition, we should not expect that the probe routine sets flags in struct page, as the struct pages have not yet been initialized. The initialization should be done in __init_single_page(), the same as during boot.
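In condensed form, the per-section test that replaces the per-page loop looks like this (a sketch of the logic in the diff that follows; warnings elided):

	for (; section_nr < section_nr_end; section_nr++) {
		if (!present_section_nr(section_nr))
			return false;	/* section does not exist */
		if (!valid_section_nr(section_nr))
			return false;	/* no memmap allocated */
		if (online_section_nr(section_nr))
			return false;	/* already online */
		pfn += PAGES_PER_SECTION;
	}
	return true;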
Link: http://lkml.kernel.org/r/20180215165920.8570-5-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Acked-by: Michal Hocko Cc: Ingo Molnar Cc: Baoquan He Cc: Bharata B Rao Cc: Daniel Jordan Cc: Dan Williams Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Kirill A. Shutemov Cc: Mel Gorman Cc: Steven Sistare Cc: Thomas Gleixner Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index fe4b24f05f6a..deb3f029b451 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -187,13 +187,14 @@ int memory_isolate_notify(unsigned long val, void *v) } /* - * The probe routines leave the pages reserved, just as the bootmem code does. - * Make sure they're still that way. + * The probe routines leave the pages uninitialized, just as the bootmem code + * does. Make sure we do not access them, but instead use only information from + * within sections. */ -static bool pages_correctly_reserved(unsigned long start_pfn) +static bool pages_correctly_probed(unsigned long start_pfn) { - int i, j; - struct page *page; + unsigned long section_nr = pfn_to_section_nr(start_pfn); + unsigned long section_nr_end = section_nr + sections_per_block; unsigned long pfn = start_pfn; /* @@ -201,21 +202,24 @@ static bool pages_correctly_reserved(unsigned long start_pfn) * SPARSEMEM_VMEMMAP. We lookup the page once per section * and assume memmap is contiguous within each section */ - for (i = 0; i < sections_per_block; i++, pfn += PAGES_PER_SECTION) { + for (; section_nr < section_nr_end; section_nr++) { if (WARN_ON_ONCE(!pfn_valid(pfn))) return false; - page = pfn_to_page(pfn); - - for (j = 0; j < PAGES_PER_SECTION; j++) { - if (PageReserved(page + j)) - continue; - - printk(KERN_WARNING "section number %ld page number %d " - "not reserved, was it already online?\n", - pfn_to_section_nr(pfn), j); + if (!present_section_nr(section_nr)) { + pr_warn("section %ld pfn[%lx, %lx) not present", + section_nr, pfn, pfn + PAGES_PER_SECTION); + return false; + } else if (!valid_section_nr(section_nr)) { + pr_warn("section %ld pfn[%lx, %lx) no valid memmap", + section_nr, pfn, pfn + PAGES_PER_SECTION); + return false; + } else if (online_section_nr(section_nr)) { + pr_warn("section %ld pfn[%lx, %lx) is already online", + section_nr, pfn, pfn + PAGES_PER_SECTION); return false; } + pfn += PAGES_PER_SECTION; } return true; @@ -237,7 +241,7 @@ memory_block_action(unsigned long phys_index, unsigned long action, int online_t switch (action) { case MEM_ONLINE: - if (!pages_correctly_reserved(start_pfn)) + if (!pages_correctly_probed(start_pfn)) return -EBUSY; ret = online_pages(start_pfn, nr_pages, online_type); -- cgit v1.2.3 From fc44f7f9231a73821fc858f5bc48883a9e78f6de Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 5 Apr 2018 16:22:56 -0700 Subject: mm/memory_hotplug: don't read nid from struct page during hotplug During memory hotplugging the probe routine will leave struct pages uninitialized, the same as it is currently done during boot. Therefore, we do not want to access the inside of struct pages before __init_single_page() is called during onlining. Because during hotplug we know that pages in one memory block belong to the same numa node, we can skip the checking. We should keep checking for the boot case. 
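The replacement lookup is straightforward (a sketch; the full context is in the online_pages() hunk below):

	struct memory_block *mem;
	int nid;

	/* struct pages may still be uninitialized here, so avoid pfn_to_nid() */
	mem = find_memory_block(__pfn_to_section(pfn));
	nid = mem->nid;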
[pasha.tatashin@oracle.com: s/register_new_memory()/hotplug_memory_register()] Link: http://lkml.kernel.org/r/20180228030308.1116-6-pasha.tatashin@oracle.com Link: http://lkml.kernel.org/r/20180215165920.8570-6-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Acked-by: Michal Hocko Reviewed-by: Ingo Molnar Cc: Baoquan He Cc: Bharata B Rao Cc: Daniel Jordan Cc: Dan Williams Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Kirill A. Shutemov Cc: Mel Gorman Cc: Steven Sistare Cc: Thomas Gleixner Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 4 ++-- drivers/base/node.c | 22 +++++++++++++++------- include/linux/memory.h | 2 +- include/linux/node.h | 4 ++-- mm/memory_hotplug.c | 2 +- 5 files changed, 21 insertions(+), 13 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index deb3f029b451..79fcd2bae96b 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -712,7 +712,7 @@ static int add_memory_block(int base_section_nr) * need an interface for the VM to add new memory regions, * but without onlining it. */ -int register_new_memory(int nid, struct mem_section *section) +int hotplug_memory_register(int nid, struct mem_section *section) { int ret = 0; struct memory_block *mem; @@ -731,7 +731,7 @@ int register_new_memory(int nid, struct mem_section *section) } if (mem->section_count == sections_per_block) - ret = register_mem_sect_under_node(mem, nid); + ret = register_mem_sect_under_node(mem, nid, false); out: mutex_unlock(&mem_sysfs_mutex); return ret; diff --git a/drivers/base/node.c b/drivers/base/node.c index c5f81fc621ac..92b00a7e6a02 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -399,7 +399,8 @@ static int __ref get_nid_for_pfn(unsigned long pfn) } /* register memory section under specified node if it spans that node */ -int register_mem_sect_under_node(struct memory_block *mem_blk, int nid) +int register_mem_sect_under_node(struct memory_block *mem_blk, int nid, + bool check_nid) { int ret; unsigned long pfn, sect_start_pfn, sect_end_pfn; @@ -425,11 +426,18 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, int nid) continue; } - page_nid = get_nid_for_pfn(pfn); - if (page_nid < 0) - continue; - if (page_nid != nid) - continue; + /* + * We need to check if page belongs to nid only for the boot + * case, during hotplug we know that all pages in the memory + * block belong to the same node. 
+ */ + if (check_nid) { + page_nid = get_nid_for_pfn(pfn); + if (page_nid < 0) + continue; + if (page_nid != nid) + continue; + } ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj, &mem_blk->dev.kobj, kobject_name(&mem_blk->dev.kobj)); @@ -504,7 +512,7 @@ int link_mem_sections(int nid, unsigned long start_pfn, unsigned long nr_pages) mem_blk = find_memory_block_hinted(mem_sect, mem_blk); - ret = register_mem_sect_under_node(mem_blk, nid); + ret = register_mem_sect_under_node(mem_blk, nid, true); if (!err) err = ret; diff --git a/include/linux/memory.h b/include/linux/memory.h index f71e732c77b2..9f8cd856ca1e 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -109,7 +109,7 @@ extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); extern int register_memory_isolate_notifier(struct notifier_block *nb); extern void unregister_memory_isolate_notifier(struct notifier_block *nb); -extern int register_new_memory(int, struct mem_section *); +int hotplug_memory_register(int nid, struct mem_section *section); #ifdef CONFIG_MEMORY_HOTREMOVE extern int unregister_memory_section(struct mem_section *); #endif diff --git a/include/linux/node.h b/include/linux/node.h index 4ece0fee0ffc..41f171861dcc 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -67,7 +67,7 @@ extern void unregister_one_node(int nid); extern int register_cpu_under_node(unsigned int cpu, unsigned int nid); extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid); extern int register_mem_sect_under_node(struct memory_block *mem_blk, - int nid); + int nid, bool check_nid); extern int unregister_mem_sect_under_nodes(struct memory_block *mem_blk, unsigned long phys_index); @@ -97,7 +97,7 @@ static inline int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) return 0; } static inline int register_mem_sect_under_node(struct memory_block *mem_blk, - int nid) + int nid, bool check_nid) { return 0; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 565048f496f7..477e183a4ac7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -279,7 +279,7 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn, if (!want_memblock) return 0; - return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); + return hotplug_memory_register(nid, __pfn_to_section(phys_start_pfn)); } /* -- cgit v1.2.3 From d0dc12e86b3197a14a908d4fe7cb35b73dda82b5 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 5 Apr 2018 16:23:00 -0700 Subject: mm/memory_hotplug: optimize memory hotplug During memory hotplugging we traverse struct pages three times: 1. memset(0) in sparse_add_one_section() 2. loop in __add_section() to call set_page_node(page, nid) and SetPageReserved(page) 3. loop in memmap_init_zone() to call __init_single_pfn() This patch removes the first two loops, and leaves only loop 3. All struct pages are initialized in one place, the same as it is done during boot. The benefits: - We improve memory hotplug performance because we are not evicting the cache several times and also reduce loop branching overhead. - Remove a condition from the hotpath in __init_single_pfn(), that was added in order to fix the problem that was reported by Bharata in the above email thread, thus also improving performance during normal boot. - Make memory hotplug more similar to the boot memory initialization path because we zero and initialize struct pages only in one function.
- Simplifies memory hotplug struct page initialization code, and thus enables future improvements, such as multi-threading the initialization of struct pages in order to improve hotplug performance even further on larger machines. [pasha.tatashin@oracle.com: v5] Link: http://lkml.kernel.org/r/20180228030308.1116-7-pasha.tatashin@oracle.com Link: http://lkml.kernel.org/r/20180215165920.8570-7-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Ingo Molnar Cc: Michal Hocko Cc: Baoquan He Cc: Bharata B Rao Cc: Daniel Jordan Cc: Dan Williams Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Kirill A. Shutemov Cc: Mel Gorman Cc: Steven Sistare Cc: Thomas Gleixner Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 2 ++ include/linux/memory.h | 1 + mm/memory_hotplug.c | 27 ++++++++------------------- mm/page_alloc.c | 28 ++++++++++------------------ mm/sparse.c | 8 +++++++- 5 files changed, 28 insertions(+), 38 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index 92b00a7e6a02..7a3a580821e0 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -407,6 +407,8 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, int nid, if (!mem_blk) return -EFAULT; + + mem_blk->nid = nid; if (!node_online(nid)) return 0; diff --git a/include/linux/memory.h b/include/linux/memory.h index 9f8cd856ca1e..31ca3e28b0eb 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -33,6 +33,7 @@ struct memory_block { void *hw; /* optional pointer to fw/hw data */ int (*phys_callback)(struct memory_block *); struct device dev; + int nid; /* NID for this memory block */ }; int arch_get_memory_phys_device(unsigned long start_pfn); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 477e183a4ac7..6a9ba14e18ed 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -250,7 +250,6 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn, struct vmem_altmap *altmap, bool want_memblock) { int ret; - int i; if (pfn_valid(phys_start_pfn)) return -EEXIST; @@ -259,23 +258,6 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn, if (ret < 0) return ret; - /* - * Make all the pages reserved so that nobody will stumble over half - * initialized state. - * FIXME: We also have to associate it with a node because page_to_nid - * relies on having page with the proper node. - */ - for (i = 0; i < PAGES_PER_SECTION; i++) { - unsigned long pfn = phys_start_pfn + i; - struct page *page; - if (!pfn_valid(pfn)) - continue; - - page = pfn_to_page(pfn); - set_page_node(page, nid); - SetPageReserved(page); - } - if (!want_memblock) return 0; @@ -908,8 +890,15 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ int nid; int ret; struct memory_notify arg; + struct memory_block *mem; + + /* + * We can't use pfn_to_nid() because nid might be stored in struct page + * which is not yet initialized. Instead, we find nid from memory block. 
+ */ + mem = find_memory_block(__pfn_to_section(pfn)); + nid = mem->nid; - nid = pfn_to_nid(pfn); /* associate pfn range with the zone */ zone = move_pfn_range(online_type, nid, pfn, nr_pages); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3183eb2f579c..a3e2ba4f76bb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1143,10 +1143,9 @@ static void free_one_page(struct zone *zone, } static void __meminit __init_single_page(struct page *page, unsigned long pfn, - unsigned long zone, int nid, bool zero) + unsigned long zone, int nid) { - if (zero) - mm_zero_struct_page(page); + mm_zero_struct_page(page); set_page_links(page, zone, nid, pfn); init_page_count(page); page_mapcount_reset(page); @@ -1160,12 +1159,6 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn, #endif } -static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone, - int nid, bool zero) -{ - return __init_single_page(pfn_to_page(pfn), pfn, zone, nid, zero); -} - #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT static void __meminit init_reserved_page(unsigned long pfn) { @@ -1184,7 +1177,7 @@ static void __meminit init_reserved_page(unsigned long pfn) if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone)) break; } - __init_single_pfn(pfn, zid, nid, true); + __init_single_page(pfn_to_page(pfn), pfn, zid, nid); } #else static inline void init_reserved_page(unsigned long pfn) @@ -1501,7 +1494,7 @@ static unsigned long __init deferred_init_pages(int nid, int zid, } else { page++; } - __init_single_page(page, pfn, zid, nid, true); + __init_single_page(page, pfn, zid, nid); nr_pages++; } return (nr_pages); @@ -5434,6 +5427,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, pg_data_t *pgdat = NODE_DATA(nid); unsigned long pfn; unsigned long nr_initialised = 0; + struct page *page; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP struct memblock_region *r = NULL, *tmp; #endif @@ -5486,6 +5480,11 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, #endif not_early: + page = pfn_to_page(pfn); + __init_single_page(page, pfn, zone, nid); + if (context == MEMMAP_HOTPLUG) + SetPageReserved(page); + /* * Mark the block movable so that blocks are reserved for * movable at startup. This will force kernel allocations @@ -5502,15 +5501,8 @@ not_early: * because this is done early in sparse_add_one_section */ if (!(pfn & (pageblock_nr_pages - 1))) { - struct page *page = pfn_to_page(pfn); - - __init_single_page(page, pfn, zone, nid, - context != MEMMAP_HOTPLUG); set_pageblock_migratetype(page, MIGRATE_MOVABLE); cond_resched(); - } else { - __init_single_pfn(pfn, zone, nid, - context != MEMMAP_HOTPLUG); } } } diff --git a/mm/sparse.c b/mm/sparse.c index 58cab483e81b..62eef264a7bd 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -779,7 +779,13 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat, goto out; } - memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION); +#ifdef CONFIG_DEBUG_VM + /* + * Poison uninitialized struct pages in order to catch invalid flags + * combinations. 
+ */ + memset(memmap, PAGE_POISON_PATTERN, sizeof(struct page) * PAGES_PER_SECTION); +#endif section_mark_present(ms); -- cgit v1.2.3 From 31286a8484a85e8b4e91ddb0f5415aee8a416827 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 5 Apr 2018 16:23:05 -0700 Subject: mm: hwpoison: disable memory error handling on 1GB hugepage Recently the following BUG was reported: Injecting memory failure for pfn 0x3c0000 at process virtual address 0x7fe300000000 Memory failure: 0x3c0000: recovery action for huge page: Recovered BUG: unable to handle kernel paging request at ffff8dfcc0003000 IP: gup_pgd_range+0x1f0/0xc20 PGD 17ae72067 P4D 17ae72067 PUD 0 Oops: 0000 [#1] SMP PTI ... CPU: 3 PID: 5467 Comm: hugetlb_1gb Not tainted 4.15.0-rc8-mm1-abc+ #3 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-1.fc25 04/01/2014 You can easily reproduce this by calling madvise(MADV_HWPOISON) twice on a 1GB hugepage. This happens because get_user_pages_fast() is not aware of a migration entry on pud that was created in the 1st madvise() event. I think that conversion to pud-aligned migration entry is working, but other MM code walking over page table isn't prepared for it. We need some time and effort to make all this work properly, so this patch avoids the reported bug by just disabling error handling for 1GB hugepage. [n-horiguchi@ah.jp.nec.com: v2] Link: http://lkml.kernel.org/r/1517284444-18149-1-git-send-email-n-horiguchi@ah.jp.nec.com Link: http://lkml.kernel.org/r/1517207283-15769-1-git-send-email-n-horiguchi@ah.jp.nec.com Signed-off-by: Naoya Horiguchi Acked-by: Michal Hocko Reviewed-by: Andrew Morton Reviewed-by: Mike Kravetz Acked-by: Punit Agrawal Tested-by: Michael Ellerman Cc: Anshuman Khandual Cc: "Aneesh Kumar K.V" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/memory-failure.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 2e40a44a1fae..2e2be527642a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2613,6 +2613,7 @@ enum mf_action_page_type { MF_MSG_POISONED_HUGE, MF_MSG_HUGE, MF_MSG_FREE_HUGE, + MF_MSG_NON_PMD_HUGE, MF_MSG_UNMAP_FAILED, MF_MSG_DIRTY_SWAPCACHE, MF_MSG_CLEAN_SWAPCACHE, diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 8291b75f42c8..2d4bf647cf01 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -502,6 +502,7 @@ static const char * const action_page_types[] = { [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned", [MF_MSG_HUGE] = "huge page", [MF_MSG_FREE_HUGE] = "free huge page", + [MF_MSG_NON_PMD_HUGE] = "non-pmd-sized huge page", [MF_MSG_UNMAP_FAILED] = "unmapping failed page", [MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page", [MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page", @@ -1084,6 +1085,21 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) return 0; } + /* + * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so + * simply disable it. In order to make it work properly, we need + * make sure that: + * - conversion of a pud that maps an error hugetlb into hwpoison + * entry properly works, and + * - other mm code walking over page table is aware of pud-aligned + * hwpoison entries. 
+ */ + if (huge_page_size(page_hstate(head)) > PMD_SIZE) { + action_result(pfn, MF_MSG_NON_PMD_HUGE, MF_IGNORED); + res = -EBUSY; + goto out; + } + if (!hwpoison_user_mappings(p, pfn, flags, &head)) { action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); res = -EBUSY; -- cgit v1.2.3 From a5c6d6509342785bef53bf9508e1842b303f1878 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 5 Apr 2018 16:23:09 -0700 Subject: mm, page_alloc: extend kernelcore and movablecore for percent Both kernelcore= and movablecore= can be used to define the amount of ZONE_NORMAL and ZONE_MOVABLE on a system, respectively. This requires the system memory capacity to be known when specifying the command line, however. This introduces the ability to define both kernelcore= and movablecore= as a percentage of total system memory. This is convenient for systems software that wants to define the amount of ZONE_MOVABLE, for example, as a proportion of a system's memory rather than a hardcoded byte value. To define the percentage, the final character of the parameter should be a '%'. mhocko: "why is anyone using these options nowadays?" rientjes: : : Fragmentation of non-__GFP_MOVABLE pages due to low on memory : situations can pollute most pageblocks on the system, as much as 1GB of : slab being fragmented over 128GB of memory, for example. When the : amount of kernel memory is well bounded for certain systems, it is : better to aggressively reclaim from existing MIGRATE_UNMOVABLE : pageblocks rather than eagerly fallback to others. : : We have additional patches that help with this fragmentation if you're : interested, specifically kcompactd compaction of MIGRATE_UNMOVABLE : pageblocks triggered by fallback of non-__GFP_MOVABLE allocations and : draining of pcp lists back to the zone free area to prevent stranding. [rientjes@google.com: updates] Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1802131700160.71590@chino.kir.corp.google.com Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1802121622470.179479@chino.kir.corp.google.com Signed-off-by: David Rientjes Reviewed-by: Andrew Morton Reviewed-by: Mike Kravetz Cc: Jonathan Corbet Cc: Vlastimil Babka Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 54 ++++++++++++------------- mm/page_alloc.c | 43 ++++++++++++++++---- 2 files changed, 62 insertions(+), 35 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 5ffe4c4121bd..7993021a1f6e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1840,30 +1840,29 @@ keepinitrd [HW,ARM] kernelcore= [KNL,X86,IA-64,PPC] - Format: nn[KMGTPE] | "mirror" - This parameter - specifies the amount of memory usable by the kernel - for non-movable allocations. The requested amount is - spread evenly throughout all nodes in the system. The - remaining memory in each node is used for Movable - pages. In the event, a node is too small to have both - kernelcore and Movable pages, kernelcore pages will - take priority and other nodes will have a larger number - of Movable pages. The Movable zone is used for the - allocation of pages that may be reclaimed or moved - by the page migration subsystem. This means that - HugeTLB pages may not be allocated from this zone. 
- Note that allocations like PTEs-from-HighMem still - use the HighMem zone if it exists, and the Normal + Format: nn[KMGTPE] | nn% | "mirror" + This parameter specifies the amount of memory usable by + the kernel for non-movable allocations. The requested + amount is spread evenly throughout all nodes in the + system as ZONE_NORMAL. The remaining memory is used for + movable memory in its own zone, ZONE_MOVABLE. In the + event, a node is too small to have both ZONE_NORMAL and + ZONE_MOVABLE, kernelcore memory will take priority and + other nodes will have a larger ZONE_MOVABLE. + + ZONE_MOVABLE is used for the allocation of pages that + may be reclaimed or moved by the page migration + subsystem. Note that allocations like PTEs-from-HighMem + still use the HighMem zone if it exists, and the Normal zone if it does not. - Instead of specifying the amount of memory (nn[KMGTPE]), - you can specify "mirror" option. In case "mirror" + It is possible to specify the exact amount of memory in + the form of "nn[KMGTPE]", a percentage of total system + memory in the form of "nn%", or "mirror". If "mirror" option is specified, mirrored (reliable) memory is used for non-movable allocations and remaining memory is used - for Movable pages. nn[KMGTPE] and "mirror" are exclusive, - so you can NOT specify nn[KMGTPE] and "mirror" at the same - time. + for Movable pages. "nn[KMGTPE]", "nn%", and "mirror" + are exclusive, so you cannot specify multiple forms. kgdbdbgp= [KGDB,HW] kgdb over EHCI usb debug port. Format: [,poll interval] @@ -2377,13 +2376,14 @@ mousedev.yres= [MOUSE] Vertical screen resolution, used for devices reporting absolute coordinates, such as tablets - movablecore=nn[KMG] [KNL,X86,IA-64,PPC] This parameter - is similar to kernelcore except it specifies the - amount of memory used for migratable allocations. - If both kernelcore and movablecore is specified, - then kernelcore will be at *least* the specified - value but may be more. If movablecore on its own - is specified, the administrator must be careful + movablecore= [KNL,X86,IA-64,PPC] + Format: nn[KMGTPE] | nn% + This parameter is the complement to kernelcore=, it + specifies the amount of memory used for migratable + allocations. If both kernelcore and movablecore is + specified, then kernelcore will be at *least* the + specified value but may be more. If movablecore on its + own is specified, the administrator must be careful that the amount of memory usable for all allocations is not too small. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a3e2ba4f76bb..766ffb5fa94b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -273,7 +273,9 @@ static unsigned long __meminitdata dma_reserve; static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; static unsigned long __initdata required_kernelcore; +static unsigned long required_kernelcore_percent __initdata; static unsigned long __initdata required_movablecore; +static unsigned long required_movablecore_percent __initdata; static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; static bool mirrored_kernelcore; @@ -6571,7 +6573,18 @@ static void __init find_zone_movable_pfns_for_nodes(void) } /* - * If movablecore=nn[KMG] was specified, calculate what size of + * If kernelcore=nn% or movablecore=nn% was specified, calculate the + * amount of necessary memory. 
+ */ + if (required_kernelcore_percent) + required_kernelcore = (totalpages * 100 * required_kernelcore_percent) / + 10000UL; + if (required_movablecore_percent) + required_movablecore = (totalpages * 100 * required_movablecore_percent) / + 10000UL; + + /* + * If movablecore= was specified, calculate what size of * kernelcore that corresponds so that memory usable for * any allocation type is evenly spread. If both kernelcore * and movablecore are specified, then the value of kernelcore @@ -6811,18 +6824,30 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) zero_resv_unavail(); } -static int __init cmdline_parse_core(char *p, unsigned long *core) +static int __init cmdline_parse_core(char *p, unsigned long *core, + unsigned long *percent) { unsigned long long coremem; + char *endptr; + if (!p) return -EINVAL; - coremem = memparse(p, &p); - *core = coremem >> PAGE_SHIFT; + /* Value may be a percentage of total memory, otherwise bytes */ + coremem = simple_strtoull(p, &endptr, 0); + if (*endptr == '%') { + /* Paranoid check for percent values greater than 100 */ + WARN_ON(coremem > 100); - /* Paranoid check that UL is enough for the coremem value */ - WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); + *percent = coremem; + } else { + coremem = memparse(p, &p); + /* Paranoid check that UL is enough for the coremem value */ + WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); + *core = coremem >> PAGE_SHIFT; + *percent = 0UL; + } return 0; } @@ -6838,7 +6863,8 @@ static int __init cmdline_parse_kernelcore(char *p) return 0; } - return cmdline_parse_core(p, &required_kernelcore); + return cmdline_parse_core(p, &required_kernelcore, + &required_kernelcore_percent); } /* @@ -6847,7 +6873,8 @@ static int __init cmdline_parse_kernelcore(char *p) */ static int __init cmdline_parse_movablecore(char *p) { - return cmdline_parse_core(p, &required_movablecore); + return cmdline_parse_core(p, &required_movablecore, + &required_movablecore_percent); } early_param("kernelcore", cmdline_parse_kernelcore); -- cgit v1.2.3 From 7f16f91fdfce694493e539bff17acaedaebc366d Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 5 Apr 2018 16:23:12 -0700 Subject: mm, page_alloc: move mirrored_kernelcore to __meminitdata mirrored_kernelcore can be in __meminitdata, so move it there. At the same time, fixup section specifiers to be after the name of the variable per checkpatch. 
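For readers unfamiliar with the checkpatch convention referenced here, a minimal illustration with a hypothetical variable name:

	/* discouraged: section attribute before the variable name */
	static __meminitdata unsigned long example_pfn;

	/* preferred by checkpatch: attribute after the name */
	static unsigned long example_pfn __meminitdata;

Both forms compile identically; the diff below is purely a style cleanup plus the __meminitdata annotation for mirrored_kernelcore.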
Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1802121623280.179479@chino.kir.corp.google.com Signed-off-by: David Rientjes Reviewed-by: Andrew Morton Cc: Mike Kravetz Cc: Jonathan Corbet Cc: Vlastimil Babka Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 766ffb5fa94b..86b7f0430e02 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -265,19 +265,19 @@ int min_free_kbytes = 1024; int user_min_free_kbytes = -1; int watermark_scale_factor = 10; -static unsigned long __meminitdata nr_kernel_pages; -static unsigned long __meminitdata nr_all_pages; -static unsigned long __meminitdata dma_reserve; +static unsigned long nr_kernel_pages __meminitdata; +static unsigned long nr_all_pages __meminitdata; +static unsigned long dma_reserve __meminitdata; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; -static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; -static unsigned long __initdata required_kernelcore; +static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __meminitdata; +static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __meminitdata; +static unsigned long required_kernelcore __initdata; static unsigned long required_kernelcore_percent __initdata; -static unsigned long __initdata required_movablecore; +static unsigned long required_movablecore __initdata; static unsigned long required_movablecore_percent __initdata; -static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; -static bool mirrored_kernelcore; +static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata; +static bool mirrored_kernelcore __meminitdata; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; -- cgit v1.2.3 From 5ad3509364a86461188184e2ae7ca517dca6f389 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 5 Apr 2018 16:23:16 -0700 Subject: mm: reuse DEFINE_SHOW_ATTRIBUTE() macro ...instead of open coding file operations followed by custom ->open() callbacks per each attribute. 
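For context, DEFINE_SHOW_ATTRIBUTE(name) generates the ->open() callback and the file_operations from an existing name##_show() function. Roughly, and eliding details of the real definition in include/linux/seq_file.h, the macro expands to:

	#define DEFINE_SHOW_ATTRIBUTE(__name)					\
	static int __name ## _open(struct inode *inode, struct file *file)	\
	{									\
		return single_open(file, __name ## _show, inode->i_private);	\
	}									\
										\
	static const struct file_operations __name ## _fops = {		\
		.owner		= THIS_MODULE,					\
		.open		= __name ## _open,				\
		.read		= seq_read,					\
		.llseek		= seq_lseek,					\
		.release	= single_release,				\
	}

This is why each hunk below can delete an open callback and a file_operations definition, and why the zsmalloc hunk also switches the debugfs_create_file() call from zs_stat_size_ops to the macro-generated zs_stats_size_fops.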
[andriy.shevchenko@linux.intel.com: add tags, fix compilation issue] Link: http://lkml.kernel.org/r/20180217144253.58604-1-andriy.shevchenko@linux.intel.com Link: http://lkml.kernel.org/r/20180214154644.54505-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Reviewed-by: Matthew Wilcox Reviewed-by: Andrew Morton Reviewed-by: Sergey Senozhatsky Acked-by: Christoph Lameter Cc: Tejun Heo Cc: Dennis Zhou Cc: Minchan Kim Cc: Nitin Gupta Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/backing-dev.c | 13 +------------ mm/memblock.c | 13 +------------ mm/percpu-stats.c | 13 +------------ mm/zsmalloc.c | 15 ++------------- 4 files changed, 5 insertions(+), 49 deletions(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index b5f940ce0143..fac66abd5a68 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -100,18 +100,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) return 0; } - -static int bdi_debug_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, bdi_debug_stats_show, inode->i_private); -} - -static const struct file_operations bdi_debug_stats_fops = { - .open = bdi_debug_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats); static int bdi_debug_register(struct backing_dev_info *bdi, const char *name) { diff --git a/mm/memblock.c b/mm/memblock.c index c2fed302c2f8..04406a930bfc 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1795,18 +1795,7 @@ static int memblock_debug_show(struct seq_file *m, void *private) } return 0; } - -static int memblock_debug_open(struct inode *inode, struct file *file) -{ - return single_open(file, memblock_debug_show, inode->i_private); -} - -static const struct file_operations memblock_debug_fops = { - .open = memblock_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(memblock_debug); static int __init memblock_init_debugfs(void) { diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c index 7a58460bfd27..063ff60ecd90 100644 --- a/mm/percpu-stats.c +++ b/mm/percpu-stats.c @@ -223,18 +223,7 @@ alloc_buffer: return 0; } - -static int percpu_stats_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, percpu_stats_show, NULL); -} - -static const struct file_operations percpu_stats_fops = { - .open = percpu_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(percpu_stats); static int __init init_percpu_stats_debugfs(void) { diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index b7f61cd1c709..a583ab111a43 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -642,18 +642,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v) return 0; } - -static int zs_stats_size_open(struct inode *inode, struct file *file) -{ - return single_open(file, zs_stats_size_show, inode->i_private); -} - -static const struct file_operations zs_stat_size_ops = { - .open = zs_stats_size_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(zs_stats_size); static void zs_pool_stat_create(struct zs_pool *pool, const char *name) { @@ -672,7 +661,7 @@ static void zs_pool_stat_create(struct zs_pool *pool, const char *name) pool->stat_dentry = entry; entry = debugfs_create_file("classes", S_IFREG | S_IRUGO, - pool->stat_dentry, pool, &zs_stat_size_ops); + pool->stat_dentry, pool, &zs_stats_size_fops); if (!entry) { pr_warn("%s: debugfs 
file entry <%s> creation failed\n", name, "classes"); -- cgit v1.2.3 From e92bb4dd9673945179b1fc738c9817dd91bfb629 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 5 Apr 2018 16:23:20 -0700 Subject: mm: fix races between address_space dereference and free in page_evictable When page_mapping() is called and the mapping is dereferenced in page_evictable() through shrink_active_list(), it is possible for the inode to be truncated and the embedded address space to be freed at the same time. This may lead to the following race.

CPU1						CPU2
truncate(inode)					shrink_active_list()
  ...						  page_evictable(page)
  truncate_inode_page(mapping, page);
    delete_from_page_cache(page)
      spin_lock_irqsave(&mapping->tree_lock, flags);
        __delete_from_page_cache(page, NULL)
          page_cache_tree_delete(..)
            ...					  mapping = page_mapping(page);
            page->mapping = NULL;
            ...
      spin_unlock_irqrestore(&mapping->tree_lock, flags);
      page_cache_free_page(mapping, page)
        put_page(page)
          if (put_page_testzero(page)) -> false
- inode now has no pages and can be freed including embedded address_space
						  mapping_unevictable(mapping)
						    test_bit(AS_UNEVICTABLE, &mapping->flags);
- we've dereferenced mapping which is potentially already free.

A similar race exists between swap cache freeing and page_evictable(). The address_space in inode and swap cache will be freed after an RCU grace period. So the races are fixed by enclosing the page_mapping() and address_space usage in rcu_read_lock/unlock(). Some comments are added in the code to make it clear what is protected by the RCU read lock. Link: http://lkml.kernel.org/r/20180212081227.1940-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Jan Kara Reviewed-by: Andrew Morton Cc: Mel Gorman Cc: Minchan Kim Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index cd5dc3faaa57..ca566640c448 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3877,7 +3877,13 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) */ int page_evictable(struct page *page) { - return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); + int ret; + + /* Prevent address_space of inode and swap cache from being freed */ + rcu_read_lock(); + ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); + rcu_read_unlock(); + return ret; } #ifdef CONFIG_SHMEM -- cgit v1.2.3 From 03f5d58fa42fb337b921e57f8e2c2d4df7df890d Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 5 Apr 2018 16:23:24 -0700 Subject: mm/page_ref: use atomic_set_release in page_ref_unfreeze page_ref_unfreeze() has exactly that semantic. No functional changes: just one barrier fewer and proper handling of the PPro errata. Link: http://lkml.kernel.org/r/151844393004.210639.4672319312617954272.stgit@buzz Signed-off-by: Konstantin Khlebnikov Acked-by: Kirill A.
Shutemov Cc: Michal Hocko Cc: Nicholas Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_ref.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 760d74a0e9a9..14d14beb1f7f 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -175,8 +175,7 @@ static inline void page_ref_unfreeze(struct page *page, int count) VM_BUG_ON_PAGE(page_count(page) != 0, page); VM_BUG_ON(count == 0); - smp_mb(); - atomic_set(&page->_refcount, count); + atomic_set_release(&page->_refcount, count); if (page_ref_tracepoint_active(__tracepoint_page_ref_unfreeze)) __page_ref_unfreeze(page, count); } -- cgit v1.2.3 From 605ca5ede7643a01f4c4a15913f9714ac297f8a6 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 5 Apr 2018 16:23:28 -0700 Subject: mm/huge_memory.c: reorder operations in __split_huge_page_tail() THP split makes a non-atomic change of tail page flags. This is almost ok because tail pages are locked and isolated, but it breaks recent changes in page locking: the non-atomic operation could clear bit PG_waiters. As a result the concurrent sequence get_page_unless_zero() -> lock_page() might block forever, especially if this page was truncated later. The fix is trivial: clone the flags before unfreezing the page reference counter. This race has existed since commit 62906027091f ("mm: add PageWaiters indicating tasks are waiting for a page bit"), while the unsafe unfreeze itself was added in commit 8df651c7059e ("thp: cleanup split_huge_page()"). clear_compound_head() also must be called before unfreezing the page reference, because after a successful get_page_unless_zero() a put_page() might follow, which needs a correct compound_head(). And replace page_ref_inc()/page_ref_add() with page_ref_unfreeze(), which is made especially for this and has the semantics of smp_store_release(). Link: http://lkml.kernel.org/r/151844393341.210639.13162088407980624477.stgit@buzz Signed-off-by: Konstantin Khlebnikov Acked-by: Kirill A. Shutemov Cc: Michal Hocko Cc: Nicholas Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5a68730eebd6..f0ae8d1d4329 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2356,26 +2356,13 @@ static void __split_huge_page_tail(struct page *head, int tail, struct page *page_tail = head + tail; VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); - VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail); /* - * tail_page->_refcount is zero and not changing from under us. But - * get_page_unless_zero() may be running from under us on the - * tail_page. If we used atomic_set() below instead of atomic_inc() or - * atomic_add(), we would then run atomic_set() concurrently with - * get_page_unless_zero(), and atomic_set() is implemented in C not - * using locked ops. spin_unlock on x86 sometime uses locked ops - * because of PPro errata 66, 92, so unless somebody can guarantee - * atomic_set() here would be safe on all archs (and not only on x86), - * it's safer to use atomic_inc()/atomic_add(). + * Clone page flags before unfreezing refcount. + * + * After successful get_page_unless_zero() might follow flags change, + * for example lock_page() which sets PG_waiters.
*/ - if (PageAnon(head) && !PageSwapCache(head)) { - page_ref_inc(page_tail); - } else { - /* Additional pin to radix tree */ - page_ref_add(page_tail, 2); - } - page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; page_tail->flags |= (head->flags & ((1L << PG_referenced) | @@ -2388,14 +2375,21 @@ static void __split_huge_page_tail(struct page *head, int tail, (1L << PG_unevictable) | (1L << PG_dirty))); - /* - * After clearing PageTail the gup refcount can be released. - * Page flags also must be visible before we make the page non-compound. - */ + /* Page flags must be visible before we make the page non-compound. */ smp_wmb(); + /* + * Clear PageTail before unfreezing page refcount. + * + * After successful get_page_unless_zero() might follow put_page() + * which needs correct compound_head(). + */ clear_compound_head(page_tail); + /* Finally unfreeze refcount. Additional reference from page cache. */ + page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) || + PageSwapCache(head))); + if (page_is_young(head)) set_page_young(page_tail); if (page_is_idle(head)) -- cgit v1.2.3 From 5c9bab592f53328b3e00d7293378b714abf47a2a Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Thu, 5 Apr 2018 16:23:32 -0700 Subject: z3fold: limit use of stale list for allocation Currently if z3fold couldn't find an unbuddied page it would first try to pull a page off the stale list. The problem with this approach is that we can't 100% guarantee that the page is not processed by the workqueue thread at the same time unless we run cancel_work_sync() on it, which we can't do if we're in an atomic context. So let's just limit stale list usage to non-atomic contexts only. Link: http://lkml.kernel.org/r/47ab51e7-e9c1-d30e-ab17-f734dbc3abce@gmail.com Signed-off-by: Vitaly Vul Reviewed-by: Andrew Morton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index d589d318727f..f579ad4a8100 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -620,24 +620,27 @@ lookup: bud = FIRST; } - spin_lock(&pool->stale_lock); - zhdr = list_first_entry_or_null(&pool->stale, - struct z3fold_header, buddy); - /* - * Before allocating a page, let's see if we can take one from the - * stale pages list. cancel_work_sync() can sleep so we must make - * sure it won't be called in case we're in atomic context. - */ - if (zhdr && (can_sleep || !work_pending(&zhdr->work))) { - list_del(&zhdr->buddy); - spin_unlock(&pool->stale_lock); - if (can_sleep) + page = NULL; + if (can_sleep) { + spin_lock(&pool->stale_lock); + zhdr = list_first_entry_or_null(&pool->stale, + struct z3fold_header, buddy); + /* + * Before allocating a page, let's see if we can take one from + * the stale pages list. 
cancel_work_sync() can sleep so we + * limit this case to the contexts where we can sleep + */ + if (zhdr) { + list_del(&zhdr->buddy); + spin_unlock(&pool->stale_lock); + cancel_work_sync(&zhdr->work); - page = virt_to_page(zhdr); - } else { - spin_unlock(&pool->stale_lock); - page = alloc_page(gfp); + page = virt_to_page(zhdr); + } else { + spin_unlock(&pool->stale_lock); + } } + if (!page) + page = alloc_page(gfp); if (!page) return -ENOMEM; -- cgit v1.2.3 From e830c63a621e20894a663351b968706bd0efbbd0 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 5 Apr 2018 16:23:35 -0700 Subject: mm,vmscan: don't pretend forward progress upon shrinker_rwsem contention Since we no longer use the return value of shrink_slab() for normal reclaim, the comment is no longer true. If some do_shrink_slab() call takes unexpectedly long (the root cause of the stall is currently unknown) when register_shrinker()/unregister_shrinker() is pending, trying to drop caches via /proc/sys/vm/drop_caches could become an infinite cond_resched() loop if many mem_cgroups are defined. For safety, let's not pretend forward progress. Link: http://lkml.kernel.org/r/201802202229.GGF26507.LVFtMSOOHFJOQF@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Dave Chinner Cc: Glauber Costa Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index ca566640c448..976be140a8ce 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -442,16 +442,8 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))) return 0; - if (!down_read_trylock(&shrinker_rwsem)) { - /* - * If we would return 0, our callers would understand that we - * have nothing else to shrink and give up trying. By returning - * 1 we keep it going and assume we'll be able to shrink next - * time. - */ - freed = 1; + if (!down_read_trylock(&shrinker_rwsem)) goto out; - } list_for_each_entry(shrinker, &shrinker_list, list) { struct shrink_control sc = { -- cgit v1.2.3 From eaf649ebc3acfbb235ce31cebd06e4876d05758e Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 5 Apr 2018 16:23:39 -0700 Subject: mm: swap: clean up swap readahead Looking at the recent changes to swap readahead, I am very unhappy about the current code structure, which diverges into two swap readahead algorithms in do_swap_page(). This patch cleans it up. The main motivation is that the fault handler doesn't need to be aware of the readahead algorithms; it should just call swapin_readahead(). As a first step, this patch cleans up a little bit, though not perfectly (the split is just to make review easier); the next patch will make the goal complete.
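Concretely, the end state (reached by the next patch in the series) is that the fault handler makes one call and the policy choice happens inside the swap code; a sketch of the intended call site:

	/* in do_swap_page(): no awareness of the readahead policy */
	page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf);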
[minchan@kernel.org: do not check readahead flag with THP anon] Link: http://lkml.kernel.org/r/874lm83zho.fsf@yhuang-dev.intel.com Link: http://lkml.kernel.org/r/20180227232611.169883-1-minchan@kernel.org Link: http://lkml.kernel.org/r/1509520520-32367-2-git-send-email-minchan@kernel.org Link: http://lkml.kernel.org/r/20180220085249.151400-2-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Andrew Morton Cc: Hugh Dickins Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 17 ++-------- mm/memory.c | 26 ++++---------- mm/swap_state.c | 96 +++++++++++++++++++++++++++++----------------------- 3 files changed, 62 insertions(+), 77 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index a1a3f4ed94ce..fa92177d863e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -424,12 +424,8 @@ extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t, bool *new_page_allocated); extern struct page *swapin_readahead(swp_entry_t, gfp_t, struct vm_area_struct *vma, unsigned long addr); - -extern struct page *swap_readahead_detect(struct vm_fault *vmf, - struct vma_swap_readahead *swap_ra); extern struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, - struct vm_fault *vmf, - struct vma_swap_readahead *swap_ra); + struct vm_fault *vmf); /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; @@ -548,15 +544,8 @@ static inline bool swap_use_vma_readahead(void) return false; } -static inline struct page *swap_readahead_detect( - struct vm_fault *vmf, struct vma_swap_readahead *swap_ra) -{ - return NULL; -} - -static inline struct page *do_swap_page_readahead( - swp_entry_t fentry, gfp_t gfp_mask, - struct vm_fault *vmf, struct vma_swap_readahead *swap_ra) +static inline struct page *do_swap_page_readahead(swp_entry_t fentry, + gfp_t gfp_mask, struct vm_fault *vmf) { return NULL; } diff --git a/mm/memory.c b/mm/memory.c index aed37325d94e..bc1ccff79538 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2883,26 +2883,16 @@ EXPORT_SYMBOL(unmap_mapping_range); int do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *page = NULL, *swapcache = NULL; + struct page *page = NULL, *swapcache; struct mem_cgroup *memcg; - struct vma_swap_readahead swap_ra; swp_entry_t entry; pte_t pte; int locked; int exclusive = 0; int ret = 0; - bool vma_readahead = swap_use_vma_readahead(); - if (vma_readahead) { - page = swap_readahead_detect(vmf, &swap_ra); - swapcache = page; - } - - if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) { - if (page) - put_page(page); + if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) goto out; - } entry = pte_to_swp_entry(vmf->orig_pte); if (unlikely(non_swap_entry(entry))) { @@ -2928,11 +2918,8 @@ int do_swap_page(struct vm_fault *vmf) delayacct_set_flag(DELAYACCT_PF_SWAPIN); - if (!page) { - page = lookup_swap_cache(entry, vma_readahead ? 
vma : NULL, - vmf->address); - swapcache = page; - } + page = lookup_swap_cache(entry, vma, vmf->address); + swapcache = page; if (!page) { struct swap_info_struct *si = swp_swap_info(entry); @@ -2949,9 +2936,9 @@ int do_swap_page(struct vm_fault *vmf) swap_readpage(page, true); } } else { - if (vma_readahead) + if (swap_use_vma_readahead()) page = do_swap_page_readahead(entry, - GFP_HIGHUSER_MOVABLE, vmf, &swap_ra); + GFP_HIGHUSER_MOVABLE, vmf); else page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, vmf->address); @@ -2982,7 +2969,6 @@ int do_swap_page(struct vm_fault *vmf) */ ret = VM_FAULT_HWPOISON; delayacct_clear_flag(DELAYACCT_PF_SWAPIN); - swapcache = page; goto out_release; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 39ae7cfad90f..db5da2baafb1 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -332,32 +332,43 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr) { struct page *page; - unsigned long ra_info; - int win, hits, readahead; page = find_get_page(swap_address_space(entry), swp_offset(entry)); INC_CACHE_INFO(find_total); if (page) { + bool vma_ra = swap_use_vma_readahead(); + bool readahead; + INC_CACHE_INFO(find_success); + /* + * At the moment, we don't support PG_readahead for anon THP + * so let's bail out rather than confusing the readahead stat. + */ if (unlikely(PageTransCompound(page))) return page; + readahead = TestClearPageReadahead(page); - if (vma) { - ra_info = GET_SWAP_RA_VAL(vma); - win = SWAP_RA_WIN(ra_info); - hits = SWAP_RA_HITS(ra_info); + if (vma && vma_ra) { + unsigned long ra_val; + int win, hits; + + ra_val = GET_SWAP_RA_VAL(vma); + win = SWAP_RA_WIN(ra_val); + hits = SWAP_RA_HITS(ra_val); if (readahead) hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(addr, win, hits)); } + if (readahead) { count_vm_event(SWAP_RA_HIT); - if (!vma) + if (!vma || !vma_ra) atomic_inc(&swapin_readahead_hits); } } + return page; } @@ -586,8 +597,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, continue; if (page_allocated) { swap_readpage(page, false); - if (offset != entry_offset && - likely(!PageTransCompound(page))) { + if (offset != entry_offset) { SetPageReadahead(page); count_vm_event(SWAP_RA); } @@ -649,16 +659,15 @@ static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma, PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE)); } -struct page *swap_readahead_detect(struct vm_fault *vmf, - struct vma_swap_readahead *swap_ra) +static void swap_ra_info(struct vm_fault *vmf, + struct vma_swap_readahead *ra_info) { struct vm_area_struct *vma = vmf->vma; - unsigned long swap_ra_info; - struct page *page; + unsigned long ra_val; swp_entry_t entry; unsigned long faddr, pfn, fpfn; unsigned long start, end; - pte_t *pte; + pte_t *pte, *orig_pte; unsigned int max_win, hits, prev_win, win, left; #ifndef CONFIG_64BIT pte_t *tpte; @@ -667,30 +676,32 @@ struct page *swap_readahead_detect(struct vm_fault *vmf, max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster), SWAP_RA_ORDER_CEILING); if (max_win == 1) { - swap_ra->win = 1; - return NULL; + ra_info->win = 1; + return; } faddr = vmf->address; - entry = pte_to_swp_entry(vmf->orig_pte); - if ((unlikely(non_swap_entry(entry)))) - return NULL; - page = lookup_swap_cache(entry, vma, faddr); - if (page) - return page; + orig_pte = pte = pte_offset_map(vmf->pmd, faddr); + entry = pte_to_swp_entry(*pte); + if ((unlikely(non_swap_entry(entry)))) { + pte_unmap(orig_pte); + return; + } fpfn = 
PFN_DOWN(faddr); - swap_ra_info = GET_SWAP_RA_VAL(vma); - pfn = PFN_DOWN(SWAP_RA_ADDR(swap_ra_info)); - prev_win = SWAP_RA_WIN(swap_ra_info); - hits = SWAP_RA_HITS(swap_ra_info); - swap_ra->win = win = __swapin_nr_pages(pfn, fpfn, hits, + ra_val = GET_SWAP_RA_VAL(vma); + pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val)); + prev_win = SWAP_RA_WIN(ra_val); + hits = SWAP_RA_HITS(ra_val); + ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits, max_win, prev_win); atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(faddr, win, 0)); - if (win == 1) - return NULL; + if (win == 1) { + pte_unmap(orig_pte); + return; + } /* Copy the PTEs because the page table may be unmapped */ if (fpfn == pfn + 1) @@ -703,23 +714,21 @@ struct page *swap_readahead_detect(struct vm_fault *vmf, swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left, &start, &end); } - swap_ra->nr_pte = end - start; - swap_ra->offset = fpfn - start; - pte = vmf->pte - swap_ra->offset; + ra_info->nr_pte = end - start; + ra_info->offset = fpfn - start; + pte -= ra_info->offset; #ifdef CONFIG_64BIT - swap_ra->ptes = pte; + ra_info->ptes = pte; #else - tpte = swap_ra->ptes; + tpte = ra_info->ptes; for (pfn = start; pfn != end; pfn++) *tpte++ = *pte++; #endif - - return NULL; + pte_unmap(orig_pte); } struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, - struct vm_fault *vmf, - struct vma_swap_readahead *swap_ra) + struct vm_fault *vmf) { struct blk_plug plug; struct vm_area_struct *vma = vmf->vma; @@ -728,12 +737,14 @@ struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, swp_entry_t entry; unsigned int i; bool page_allocated; + struct vma_swap_readahead ra_info = {0,}; - if (swap_ra->win == 1) + swap_ra_info(vmf, &ra_info); + if (ra_info.win == 1) goto skip; blk_start_plug(&plug); - for (i = 0, pte = swap_ra->ptes; i < swap_ra->nr_pte; + for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte; i++, pte++) { pentry = *pte; if (pte_none(pentry)) @@ -749,8 +760,7 @@ struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, continue; if (page_allocated) { swap_readpage(page, false); - if (i != swap_ra->offset && - likely(!PageTransCompound(page))) { + if (i != ra_info.offset) { SetPageReadahead(page); count_vm_event(SWAP_RA); } @@ -761,7 +771,7 @@ struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, lru_add_drain(); skip: return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address, - swap_ra->win == 1); + ra_info.win == 1); } #ifdef CONFIG_SYSFS -- cgit v1.2.3 From e9e9b7ecee4a139a6fbe2e15ef224ca6b6c47d57 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 5 Apr 2018 16:23:42 -0700 Subject: mm: swap: unify cluster-based and vma-based swap readahead This patch makes do_swap_page() not need to be aware of two different swap readahead algorithms. Just unify cluster-based and vma-based readahead function call. 
Link: http://lkml.kernel.org/r/1509520520-32367-3-git-send-email-minchan@kernel.org Link: http://lkml.kernel.org/r/20180220085249.151400-3-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Andrew Morton Cc: Hugh Dickins Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 27 ++++++++------------------- mm/memory.c | 11 ++++------- mm/shmem.c | 5 ++++- mm/swap_state.c | 48 +++++++++++++++++++++++++++++++++++++----------- 4 files changed, 53 insertions(+), 38 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index fa92177d863e..2417d288e016 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -400,7 +400,6 @@ int generic_swapfile_activate(struct swap_info_struct *, struct file *, #define SWAP_ADDRESS_SPACE_SHIFT 14 #define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT) extern struct address_space *swapper_spaces[]; -extern bool swap_vma_readahead; #define swap_address_space(entry) \ (&swapper_spaces[swp_type(entry)][swp_offset(entry) \ >> SWAP_ADDRESS_SPACE_SHIFT]) @@ -422,10 +421,10 @@ extern struct page *read_swap_cache_async(swp_entry_t, gfp_t, extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t, struct vm_area_struct *vma, unsigned long addr, bool *new_page_allocated); -extern struct page *swapin_readahead(swp_entry_t, gfp_t, - struct vm_area_struct *vma, unsigned long addr); -extern struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, - struct vm_fault *vmf); +extern struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, + struct vm_fault *vmf); +extern struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, + struct vm_fault *vmf); /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; @@ -433,11 +432,6 @@ extern long total_swap_pages; extern atomic_t nr_rotate_swap; extern bool has_usable_swap(void); -static inline bool swap_use_vma_readahead(void) -{ - return READ_ONCE(swap_vma_readahead) && !atomic_read(&nr_rotate_swap); -} - /* Swap 50% full? Release swapcache more aggressively.. 
*/ static inline bool vm_swap_full(void) { @@ -533,19 +527,14 @@ static inline void put_swap_page(struct page *page, swp_entry_t swp) { } -static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, - struct vm_area_struct *vma, unsigned long addr) +static inline struct page *swap_cluster_readahead(swp_entry_t entry, + gfp_t gfp_mask, struct vm_fault *vmf) { return NULL; } -static inline bool swap_use_vma_readahead(void) -{ - return false; -} - -static inline struct page *do_swap_page_readahead(swp_entry_t fentry, - gfp_t gfp_mask, struct vm_fault *vmf) +static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, + struct vm_fault *vmf) { return NULL; } diff --git a/mm/memory.c b/mm/memory.c index bc1ccff79538..01f5464e0fd2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2927,7 +2927,8 @@ int do_swap_page(struct vm_fault *vmf) if (si->flags & SWP_SYNCHRONOUS_IO && __swap_count(si, entry) == 1) { /* skip swapcache */ - page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); + page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, + vmf->address); if (page) { __SetPageLocked(page); __SetPageSwapBacked(page); @@ -2936,12 +2937,8 @@ int do_swap_page(struct vm_fault *vmf) swap_readpage(page, true); } } else { - if (swap_use_vma_readahead()) - page = do_swap_page_readahead(entry, - GFP_HIGHUSER_MOVABLE, vmf); - else - page = swapin_readahead(entry, - GFP_HIGHUSER_MOVABLE, vma, vmf->address); + page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, + vmf); swapcache = page; } diff --git a/mm/shmem.c b/mm/shmem.c index b85919243399..4424fc0c33aa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1422,9 +1422,12 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, { struct vm_area_struct pvma; struct page *page; + struct vm_fault vmf; shmem_pseudo_vma_init(&pvma, info, index); - page = swapin_readahead(swap, gfp, &pvma, 0); + vmf.vma = &pvma; + vmf.address = 0; + page = swap_cluster_readahead(swap, gfp, &vmf); shmem_pseudo_vma_destroy(&pvma); return page; diff --git a/mm/swap_state.c b/mm/swap_state.c index db5da2baafb1..b97da97b6846 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -38,7 +38,7 @@ static const struct address_space_operations swap_aops = { struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly; static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly; -bool swap_vma_readahead __read_mostly = true; +bool enable_vma_readahead __read_mostly = true; #define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) #define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) @@ -322,6 +322,11 @@ void free_pages_and_swap_cache(struct page **pages, int nr) release_pages(pagep, nr); } +static inline bool swap_use_vma_readahead(void) +{ + return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap); +} + /* * Lookup a swap entry in the swap cache. A found page will be returned * unlocked and with its refcount incremented - we rely on the kernel @@ -544,11 +549,10 @@ static unsigned long swapin_nr_pages(unsigned long offset) } /** - * swapin_readahead - swap in pages in hope we need them soon + * swap_cluster_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory * @gfp_mask: memory allocation flags - * @vma: user vma this address belongs to - * @addr: target address for mempolicy + * @vmf: fault information * * Returns the struct page for entry and addr, after queueing swapin. 
* @@ -560,10 +564,10 @@ static unsigned long swapin_nr_pages(unsigned long offset) * This has been extended to use the NUMA policies from the mm triggering * the readahead. * - * Caller must hold down_read on the vma->vm_mm if vma is not NULL. + * Caller must hold down_read on the vma->vm_mm if vmf->vma is not NULL. */ -struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, - struct vm_area_struct *vma, unsigned long addr) +struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, + struct vm_fault *vmf) { struct page *page; unsigned long entry_offset = swp_offset(entry); @@ -573,6 +577,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct swap_info_struct *si = swp_swap_info(entry); struct blk_plug plug; bool do_poll = true, page_allocated; + struct vm_area_struct *vma = vmf->vma; + unsigned long addr = vmf->address; mask = swapin_nr_pages(offset) - 1; if (!mask) @@ -727,7 +733,7 @@ static void swap_ra_info(struct vm_fault *vmf, pte_unmap(orig_pte); } -struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, +struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, struct vm_fault *vmf) { struct blk_plug plug; @@ -774,20 +780,40 @@ skip: ra_info.win == 1); } +/** + * swapin_readahead - swap in pages in hope we need them soon + * @entry: swap entry of this memory + * @gfp_mask: memory allocation flags + * @vmf: fault information + * + * Returns the struct page for entry and addr, after queueing swapin. + * + * It's a main entry function for swap readahead. By the configuration, + * it will read ahead blocks by cluster-based(ie, physical disk based) + * or vma-based(ie, virtual address based on faulty address) readahead. + */ +struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, + struct vm_fault *vmf) +{ + return swap_use_vma_readahead() ? + swap_vma_readahead(entry, gfp_mask, vmf) : + swap_cluster_readahead(entry, gfp_mask, vmf); +} + #ifdef CONFIG_SYSFS static ssize_t vma_ra_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%s\n", swap_vma_readahead ? "true" : "false"); + return sprintf(buf, "%s\n", enable_vma_readahead ? "true" : "false"); } static ssize_t vma_ra_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) - swap_vma_readahead = true; + enable_vma_readahead = true; else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) - swap_vma_readahead = false; + enable_vma_readahead = false; else return -EINVAL; -- cgit v1.2.3 From 8bd30c1090995ce800ae257fbbabfd6b6899b57c Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Thu, 5 Apr 2018 16:23:46 -0700 Subject: mm/kmemleak.c: make kmemleak_boot_config() __init The early_param() is only called during kernel initialization, So Linux marks the functions of it with __init macro to save memory. But it forgot to mark the kmemleak_boot_config(). So, Make it __init as well. Link: http://lkml.kernel.org/r/20180117034720.26897-1-douly.fnst@cn.fujitsu.com Signed-off-by: Dou Liyang Reviewed-by: Andrew Morton Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 46c2290a08f1..8029501dc65c 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1963,7 +1963,7 @@ static void kmemleak_disable(void) /* * Allow boot-time kmemleak disabling (enabled by default). 
*/ -static int kmemleak_boot_config(char *str) +static int __init kmemleak_boot_config(char *str) { if (!str) return -EINVAL; -- cgit v1.2.3 From 1173194e1e932e4aa9ce4b0ecac72446482f0e4f Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Thu, 5 Apr 2018 16:23:49 -0700 Subject: mm/page_owner.c: make early_page_owner_param() __init Functions registered via early_param() are only called during kernel initialization, so Linux marks them with the __init macro to save memory. But it forgot to mark early_page_owner_param(). So, make it __init as well. Link: http://lkml.kernel.org/r/20180117034736.26963-1-douly.fnst@cn.fujitsu.com Signed-off-by: Dou Liyang Reviewed-by: Andrew Morton Cc: Vlastimil Babka Cc: Michal Hocko Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_owner.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 7172e0a80e13..75d21a2259b3 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -35,7 +35,7 @@ static depot_stack_handle_t early_handle; static void init_early_allocated_pages(void); -static int early_page_owner_param(char *buf) +static int __init early_page_owner_param(char *buf) { if (!buf) return -EINVAL; -- cgit v1.2.3 From 14298d3663e44705d3f9106343ac8298a5fdff99 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Thu, 5 Apr 2018 16:23:53 -0700 Subject: mm/page_poison.c: make early_page_poison_param() __init Functions registered via early_param() are only called during kernel initialization, so Linux marks them with the __init macro to save memory. But it forgot to mark early_page_poison_param(). So, make it __init as well. Link: http://lkml.kernel.org/r/20180117034757.27024-1-douly.fnst@cn.fujitsu.com Signed-off-by: Dou Liyang Reviewed-by: Andrew Morton Acked-by: Michal Hocko Cc: Philippe Ombredanne Cc: Kate Stewart Cc: Michal Hocko Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_poison.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_poison.c b/mm/page_poison.c index e83fd44867de..aa2b3d34e8ea 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -9,7 +9,7 @@ static bool want_page_poisoning __read_mostly; -static int early_page_poison_param(char *buf) +static int __init early_page_poison_param(char *buf) { if (!buf) return -EINVAL; -- cgit v1.2.3 From 4f6923fbb352d126659cabe34806cff75c7b5ea0 Mon Sep 17 00:00:00 2001 From: Howard McLauchlan Date: Thu, 5 Apr 2018 16:23:57 -0700 Subject: mm: make should_failslab always available for fault injection should_failslab() is a convenient function to hook into for directed error injection into kmalloc(). However, it is only available if a config flag is set. The following BCC script, for example, fails kmalloc() calls after a btrfs umount:

	from bcc import BPF

	prog = r"""
	BPF_HASH(flag);

	#include

	int kprobe__btrfs_close_devices(void *ctx)
	{
		u64 key = 1;
		flag.update(&key, &key);
		return 0;
	}

	int kprobe__should_failslab(struct pt_regs *ctx)
	{
		u64 key = 1;
		u64 *res;

		res = flag.lookup(&key);
		if (res != 0) {
			bpf_override_return(ctx, -ENOMEM);
		}
		return 0;
	}
	"""
	b = BPF(text=prog)

	while 1:
		b.kprobe_poll()

This patch refactors the should_failslab implementation so that the function is always available for error injection, independent of flags. This change would be similar in nature to commit f5490d3ec921 ("block: Add should_fail_bio() for bpf error injection").
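For orientation, the hook is consulted on every slab allocation; a simplified sketch of the pre-allocation path (not the exact allocator code) shows why a nonzero return makes kmalloc() fail:

	static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
							     gfp_t flags)
	{
		if (should_failslab(s, flags))	/* always built in after this patch */
			return NULL;		/* the allocation fails */
		/* ... */
		return s;
	}

With the refactor, bpf_override_return() on should_failslab() works (given the error-injection config) whether or not CONFIG_FAILSLAB is set; that option now only controls the built-in __should_failslab() filtering policy.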
Link: http://lkml.kernel.org/r/20180222020320.6944-1-hmclauchlan@fb.com Signed-off-by: Howard McLauchlan Reviewed-by: Andrew Morton Cc: Akinobu Mita Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Josef Bacik Cc: Johannes Weiner Cc: Alexei Starovoitov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fault-inject.h | 5 +++-- mm/failslab.c | 2 +- mm/slab_common.c | 8 ++++++++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h index c3c95d18bf43..7e6c77740413 100644 --- a/include/linux/fault-inject.h +++ b/include/linux/fault-inject.h @@ -64,10 +64,11 @@ static inline struct dentry *fault_create_debugfs_attr(const char *name, struct kmem_cache; +int should_failslab(struct kmem_cache *s, gfp_t gfpflags); #ifdef CONFIG_FAILSLAB -extern bool should_failslab(struct kmem_cache *s, gfp_t gfpflags); +extern bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags); #else -static inline bool should_failslab(struct kmem_cache *s, gfp_t gfpflags) +static inline bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags) { return false; } diff --git a/mm/failslab.c b/mm/failslab.c index 8087d976a809..1f2f248e3601 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -14,7 +14,7 @@ static struct { .cache_filter = false, }; -bool should_failslab(struct kmem_cache *s, gfp_t gfpflags) +bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags) { /* No fault-injection for bootstrap cache */ if (unlikely(s == kmem_cache)) diff --git a/mm/slab_common.c b/mm/slab_common.c index 2e682a4c8877..98dcdc352062 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1516,3 +1516,11 @@ EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); EXPORT_TRACEPOINT_SYMBOL(kfree); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); + +int should_failslab(struct kmem_cache *s, gfp_t gfpflags) +{ + if (__should_failslab(s, gfpflags)) + return -ENOMEM; + return 0; +} +ALLOW_ERROR_INJECTION(should_failslab, ERRNO); -- cgit v1.2.3 From bc3106b26cf6a6f214fd1a8538736afc39ae1b5c Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 5 Apr 2018 16:24:02 -0700 Subject: mm, compaction: drain pcps for zone when kcompactd fails It's possible for free pages to become stranded on per-cpu pagesets (pcps) that, if drained, could be merged with buddy pages on the zone's free area to form large order pages, including up to MAX_ORDER. Consider a verbose example using the tools/vm/page-types tool at the beginning of a ZONE_NORMAL ('B' indicates a buddy page and 'S' indicates a slab page). Pages on pcps do not have any page flags set. 109954 1 _______S________________________________________________________ 109955 2 __________B_____________________________________________________ 109957 1 ________________________________________________________________ 109958 1 __________B_____________________________________________________ 109959 7 ________________________________________________________________ 109960 1 __________B_____________________________________________________ 109961 9 ________________________________________________________________ 10996a 1 __________B_____________________________________________________ 10996b 3 ________________________________________________________________ 10996e 1 __________B_____________________________________________________ 10996f 1 ________________________________________________________________ ... 
109f8c 1 __________B_____________________________________________________ 109f8d 2 ________________________________________________________________ 109f8f 2 __________B_____________________________________________________ 109f91 f ________________________________________________________________ 109fa0 1 __________B_____________________________________________________ 109fa1 7 ________________________________________________________________ 109fa8 1 __________B_____________________________________________________ 109fa9 1 ________________________________________________________________ 109faa 1 __________B_____________________________________________________ 109fab 1 _______S________________________________________________________ The compaction migration scanner is attempting to defragment this memory since it is at the beginning of the zone. It has done so quite well, all movable pages have been migrated. From pfn [0x109955, 0x109fab), there are only buddy pages and pages without flags set. These pages may be stranded on pcps that could otherwise allow this memory to be coalesced if freed back to the zone free area. It is possible that some of these pages may not be on pcps and that something has called alloc_pages() and used the memory directly, but we rely on the absence of __GFP_MOVABLE in these cases to allocate from MIGATE_UNMOVABLE pageblocks to try to keep these MIGRATE_MOVABLE pageblocks as free as possible. These buddy and pcp pages, spanning 1,621 pages, could be coalesced and allow for three transparent hugepages to be dynamically allocated. Running the numbers for all such spans on the system, it was found that there were over 400 such spans of only buddy pages and pages without flags set at the time this /proc/kpageflags sample was collected. Without this support, there were _no_ order-9 or order-10 pages free. When kcompactd fails to defragment memory such that a cc.order page can be allocated, drain all pcps for the zone back to the buddy allocator so this stranding cannot occur. Compaction for that order will subsequently be deferred, which acts as a ratelimit on this drain. Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1803010340100.88270@chino.kir.corp.google.com Signed-off-by: David Rientjes Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/compaction.c b/mm/compaction.c index 2c8999d027ab..a68230ab451d 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1987,6 +1987,14 @@ static void kcompactd_do_work(pg_data_t *pgdat) if (status == COMPACT_SUCCESS) { compaction_defer_reset(zone, cc.order, false); } else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) { + /* + * Buddy pages may become stranded on pcps that could + * otherwise coalesce on the zone's free area for + * order >= cc.order. This is ratelimited by the + * upcoming deferral. + */ + drain_all_pages(zone); + /* * We use sync migration mode here, so we defer like * sync direct compaction does. -- cgit v1.2.3 From 77ba9062e43c7e4966d9ff3afd87dca86542f86a Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Thu, 5 Apr 2018 16:24:06 -0700 Subject: mm/free_pcppages_bulk: update pcp->count inside Matthew Wilcox found that all callers of free_pcppages_bulk() currently update pcp->count immediately after so it's natural to do it inside free_pcppages_bulk(). No functionality or performance change is expected from this patch. 
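In caller terms the change is a small contract adjustment; a sketch:

	/* before: every caller paired the bulk free with a manual update */
	free_pcppages_bulk(zone, to_drain, pcp);
	pcp->count -= to_drain;

	/* after: free_pcppages_bulk() maintains pcp->count itself */
	free_pcppages_bulk(zone, to_drain, pcp);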
Link: http://lkml.kernel.org/r/20180301062845.26038-2-aaron.lu@intel.com Signed-off-by: Aaron Lu Suggested-by: Matthew Wilcox Acked-by: David Rientjes Acked-by: Michal Hocko Reviewed-by: Andrew Morton Acked-by: Vlastimil Babka Cc: Huang Ying Cc: Dave Hansen Cc: Kemi Wang Cc: Tim Chen Cc: Andi Kleen Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 86b7f0430e02..08c195cdf161 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1112,6 +1112,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, page = list_last_entry(list, struct page, lru); /* must delete as __free_one_page list manipulates */ list_del(&page->lru); + pcp->count--; mt = get_pcppage_migratetype(page); /* MIGRATE_ISOLATE page should not go to pcplists */ @@ -2495,10 +2496,8 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) local_irq_save(flags); batch = READ_ONCE(pcp->batch); to_drain = min(pcp->count, batch); - if (to_drain > 0) { + if (to_drain > 0) free_pcppages_bulk(zone, to_drain, pcp); - pcp->count -= to_drain; - } local_irq_restore(flags); } #endif @@ -2520,10 +2519,8 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone) pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; - if (pcp->count) { + if (pcp->count) free_pcppages_bulk(zone, pcp->count, pcp); - pcp->count = 0; - } local_irq_restore(flags); } @@ -2747,7 +2744,6 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn) if (pcp->count >= pcp->high) { unsigned long batch = READ_ONCE(pcp->batch); free_pcppages_bulk(zone, batch, pcp); - pcp->count -= batch; } } -- cgit v1.2.3 From 0a5f4e5b45625e75db85b4968fc4c232d8091143 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Thu, 5 Apr 2018 16:24:10 -0700 Subject: mm/free_pcppages_bulk: do not hold lock when picking pages to free When freeing a batch of pages from Per-CPU-Pages(PCP) back to buddy, the zone->lock is held and then pages are chosen from PCP's migratetype list. There is actually no need to do this 'choose' part under the lock: these are PCP pages, the only CPU that can touch them is us, and irqs are disabled. Moving this part outside the lock reduces lock hold time and improves performance. Test with will-it-scale/page_fault1 full load:

kernel        Broadwell(2S)  Skylake(2S)   Broadwell(4S)  Skylake(4S)
v4.16-rc2+    9034215        7971818       13667135       15677465
this patch    9536374 +5.6%  8314710 +4.3% 14070408 +3.0% 16675866 +6.4%

The test starts $nr_cpu processes, each of which repeatedly does the following for 5 minutes:
- mmap 128M of anonymous space
- write to that space
- munmap
The score is the aggregated iteration count.
https://github.com/antonblanchard/will-it-scale/blob/master/tests/page_fault1.c Link: http://lkml.kernel.org/r/20180301062845.26038-3-aaron.lu@intel.com Signed-off-by: Aaron Lu Acked-by: Mel Gorman Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Andi Kleen Cc: Dave Hansen Cc: David Rientjes Cc: Huang Ying Cc: Kemi Wang Cc: Matthew Wilcox Cc: Tim Chen Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 08c195cdf161..e29a6ba050c8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1080,12 +1080,10 @@ static void free_pcppages_bulk(struct zone *zone, int count, int migratetype = 0; int batch_free = 0; bool isolated_pageblocks; - - spin_lock(&zone->lock); - isolated_pageblocks = has_isolate_pageblock(zone); + struct page *page, *tmp; + LIST_HEAD(head); while (count) { - struct page *page; struct list_head *list; /* @@ -1107,27 +1105,36 @@ static void free_pcppages_bulk(struct zone *zone, int count, batch_free = count; do { - int mt; /* migratetype of the to-be-freed page */ - page = list_last_entry(list, struct page, lru); - /* must delete as __free_one_page list manipulates */ + /* must delete to avoid corrupting pcp list */ list_del(&page->lru); pcp->count--; - mt = get_pcppage_migratetype(page); - /* MIGRATE_ISOLATE page should not go to pcplists */ - VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); - /* Pageblock could have been isolated meanwhile */ - if (unlikely(isolated_pageblocks)) - mt = get_pageblock_migratetype(page); if (bulkfree_pcp_prepare(page)) continue; - __free_one_page(page, page_to_pfn(page), zone, 0, mt); - trace_mm_page_pcpu_drain(page, 0, mt); + list_add_tail(&page->lru, &head); } while (--count && --batch_free && !list_empty(list)); } + + spin_lock(&zone->lock); + isolated_pageblocks = has_isolate_pageblock(zone); + + /* + * Use safe version since after __free_one_page(), + * page->lru.next will not point to original list. + */ + list_for_each_entry_safe(page, tmp, &head, lru) { + int mt = get_pcppage_migratetype(page); + /* MIGRATE_ISOLATE page should not go to pcplists */ + VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); + /* Pageblock could have been isolated meanwhile */ + if (unlikely(isolated_pageblocks)) + mt = get_pageblock_migratetype(page); + + __free_one_page(page, page_to_pfn(page), zone, 0, mt); + trace_mm_page_pcpu_drain(page, 0, mt); } spin_unlock(&zone->lock); } -- cgit v1.2.3 From 97334162e4d79f866edd7308aac0ab3ab7a103f7 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Thu, 5 Apr 2018 16:24:14 -0700 Subject: mm/free_pcppages_bulk: prefetch buddy while not holding lock When a page is freed back to the global pool, its buddy will be checked to see if it's possible to do a merge. This requires accessing the buddy's page structure, and that access could take a long time if it's cache cold. This patch adds a prefetch of the to-be-freed page's buddy outside of zone->lock, in the hope that accessing the buddy's page structure later under zone->lock will be faster. Since we *always* do buddy merging and check an order-0 page's buddy to try to merge it when it goes into the main allocator, the cacheline will always come in, i.e. the prefetched data will never be unused.
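For reference, finding an order-0 buddy is a single XOR on the page frame number, which is what prefetch_buddy() in the hunk below computes:

	unsigned long pfn = page_to_pfn(page);
	unsigned long buddy_pfn = pfn ^ 1;		/* __find_buddy_pfn(pfn, 0) */
	struct page *buddy = page + (buddy_pfn - pfn);	/* the adjacent struct page */

	prefetch(buddy);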
Normally, the number of prefetches will be pcp->batch (default 31, with an upper limit of (PAGE_SHIFT * 8) = 96 on x86_64), but in the case where the pcp's pages all get drained, it will be pcp->count, which has an upper limit of pcp->high. pcp->high, although it has a default value of 186 (pcp->batch = 31 * 6), can be changed by the user through /proc/sys/vm/percpu_pagelist_fraction, and there is no software upper limit, so it could be large, like several thousand. For this reason, only the buddies of the first pcp->batch pages are prefetched, to avoid excessive prefetching. In the meantime, there are two concerns:
1. the prefetch could potentially evict existing cachelines, especially for the L1D cache, since it is not huge;
2. there is some additional instruction overhead, namely calculating the buddy pfn twice.
For 1, it's hard to say: this microbenchmark shows a good result, but the actual benefit of this patch will be workload/CPU dependent. For 2, since the calculation is an XOR of two local variables, it's expected that in many cases the cycles spent will be offset by reduced memory latency later. This is especially true for NUMA machines, where multiple CPUs are contending on zone->lock and the most time-consuming part under zone->lock is the wait for the 'struct page' cachelines of the to-be-freed pages and their buddies. Test with will-it-scale/page_fault1 full load:

kernel        Broadwell(2S)  Skylake(2S)   Broadwell(4S)  Skylake(4S)
v4.16-rc2+    9034215        7971818       13667135       15677465
patch2/3      9536374 +5.6%  8314710 +4.3% 14070408 +3.0% 16675866 +6.4%
this patch    10180856 +6.8% 8506369 +2.3% 14756865 +4.9% 17325324 +3.9%

Note: this patch's performance improvement percent is against patch2/3. (Changelog stolen from Dave Hansen and Mel Gorman's comments at http://lkml.kernel.org/r/148a42d8-8306-2f2f-7f7c-86bc118f8ccd@intel.com) [aaron.lu@intel.com: use helper function, avoid disordering pages] Link: http://lkml.kernel.org/r/20180301062845.26038-4-aaron.lu@intel.com Link: http://lkml.kernel.org/r/20180320113146.GB24737@intel.com [aaron.lu@intel.com: v4] Link: http://lkml.kernel.org/r/20180301062845.26038-4-aaron.lu@intel.com Link: http://lkml.kernel.org/r/20180309082431.GB30868@intel.com Link: http://lkml.kernel.org/r/20180301062845.26038-4-aaron.lu@intel.com Signed-off-by: Aaron Lu Suggested-by: Ying Huang Reviewed-by: Andrew Morton Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Dave Hansen Cc: David Rientjes Cc: Kemi Wang Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Tim Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e29a6ba050c8..f6005b7c3446 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1063,6 +1063,15 @@ static bool bulkfree_pcp_prepare(struct page *page) } #endif /* CONFIG_DEBUG_VM */ +static inline void prefetch_buddy(struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0); + struct page *buddy = page + (buddy_pfn - pfn); + + prefetch(buddy); +} + /* * Frees a number of pages from the PCP lists * Assumes all pages on list are in same zone, and of same order.
@@ -1079,6 +1088,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; + int prefetch_nr = 0; bool isolated_pageblocks; struct page *page, *tmp; LIST_HEAD(head); @@ -1114,6 +1124,18 @@ static void free_pcppages_bulk(struct zone *zone, int count, continue; list_add_tail(&page->lru, &head); + + /* + * We are going to put the page back to the global + * pool, prefetch its buddy to speed up later access + * under zone->lock. It is believed the overhead of + * an additional test and calculating buddy_pfn here + * can be offset by reduced memory latency later. To + * avoid excessive prefetching due to large count, only + * prefetch buddy for the first pcp->batch nr of pages. + */ + if (prefetch_nr++ < pcp->batch) + prefetch_buddy(page); } while (--count && --batch_free && !list_empty(list)); } -- cgit v1.2.3 From 2923117b7162946042e49cd363846c498293230c Mon Sep 17 00:00:00 2001 From: Mario Leinweber Date: Thu, 5 Apr 2018 16:24:18 -0700 Subject: mm/gup.c: fix coding style issues. - Fixed style error: 8 spaces -> 1 tab. - Fixed style warning: Corrected misleading indentation. Link: http://lkml.kernel.org/r/20180302210254.31888-1-marioleinweber@web.de Signed-off-by: Mario Leinweber Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 6afae32571ca..f296df6cf666 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -531,7 +531,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, * reCOWed by userspace write). */ if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) - *flags |= FOLL_COW; + *flags |= FOLL_COW; return 0; } @@ -1638,7 +1638,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, PMD_SHIFT, next, write, pages, nr)) return 0; } else if (!gup_pte_range(pmd, addr, next, write, pages, nr)) - return 0; + return 0; } while (pmdp++, addr = next, addr != end); return 1; -- cgit v1.2.3 From 09135cc594d141cd279c32a18b91cb3bd3fe8cc5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 5 Apr 2018 16:24:21 -0700 Subject: mm, powerpc: use vma_kernel_pagesize() in vma_mmu_pagesize() Patch series "mm, smaps: MMUPageSize for device-dax", v3. Similar to commit 31383c6865a5 ("mm, hugetlbfs: introduce ->split() to vm_operations_struct") here is another occasion where we want special-case hugetlbfs/hstate enabling to also apply to device-dax. This prompts the question of what other hstate conversions we might do beyond ->split() and ->pagesize(), but this appears to be the last of the usages of hstate_vma() in generic/non-hugetlbfs specific code paths. This patch (of 3): The current powerpc definition of vma_mmu_pagesize() open codes looking up the page size via hstate. It is identical to the generic vma_kernel_pagesize() implementation. Now, vma_kernel_pagesize() is growing support for determining the page size of Device-DAX vmas in addition to the existing Hugetlbfs page size determination. Ideally, if the powerpc vma_mmu_pagesize() used vma_kernel_pagesize() it would automatically benefit from any new vma-type support that is added to vma_kernel_pagesize(). However, the powerpc vma_mmu_pagesize() is prevented from calling vma_kernel_pagesize() due to a circular header dependency that requires vma_mmu_pagesize() to be defined before including <linux/hugetlb.h>.
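The fix that follows makes the generic definition a __weak symbol. As a brief, hedged refresher on that mechanism (a generic two-file example with made-up function names, not code from this series): a __weak definition survives linking only if no ordinary "strong" definition of the same symbol exists anywhere else in the image.

	/* generic.c: default implementation; __weak is the kernel's
	 * wrapper around __attribute__((weak)) */
	__weak unsigned long example_pagesize(void)
	{
		return PAGE_SIZE;
	}

	/* arch/foo.c: an ordinary (strong) definition of the same
	 * symbol; the linker silently picks this over the weak default */
	unsigned long example_pagesize(void)
	{
		return 1UL << 16;	/* hypothetical 64K arch page size */
	}

Because the override is resolved at link time, no header needs to see a #define for the arch to take over, which is exactly what sidesteps the circular include.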
Break this circular dependency by defining the default vma_mmu_pagesize() as a __weak symbol to be overridden by the powerpc version. Link: http://lkml.kernel.org/r/151996254179.27922.2213728278535578744.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reviewed-by: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Jane Chu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/hugetlb.h | 6 ------ arch/powerpc/mm/hugetlbpage.c | 5 +---- mm/hugetlb.c | 8 +++----- 3 files changed, 4 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 1a4847f67ea8..6f6751d3eba9 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -117,12 +117,6 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); -/* - * The version of vma_mmu_pagesize() in arch/powerpc/mm/hugetlbpage.c needs - * to override the version in mm/hugetlb.c - */ -#define vma_mmu_pagesize vma_mmu_pagesize - /* * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 876da2bc1796..3a08d211d2ee 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -568,10 +568,7 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) if (!radix_enabled()) return 1UL << mmu_psize_to_shift(psize); #endif - if (!is_vm_hugetlb_page(vma)) - return PAGE_SIZE; - - return huge_page_size(hstate_vma(vma)); + return vma_kernel_pagesize(vma); } static inline bool is_power_of_4(unsigned long x) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 976bbc5646fe..92c49b9d7cbb 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -651,15 +651,13 @@ EXPORT_SYMBOL_GPL(vma_kernel_pagesize); /* * Return the page size being used by the MMU to back a VMA. In the majority * of cases, the page size used by the kernel matches the MMU size. On - * architectures where it differs, an architecture-specific version of this - * function is required. + * architectures where it differs, an architecture-specific 'strong' + * version of this symbol is required. */ -#ifndef vma_mmu_pagesize -unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) +__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { return vma_kernel_pagesize(vma); } -#endif /* * Flags for MAP_PRIVATE reservations. These are stored in the bottom -- cgit v1.2.3 From 05ea88608d4e135695571727f5d7f22967d2a3bf Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 5 Apr 2018 16:24:25 -0700 Subject: mm, hugetlbfs: introduce ->pagesize() to vm_operations_struct When device-dax is operating in huge-page mode we want it to behave like hugetlbfs and report the MMU page mapping size that is being enforced by the vma. Similar to commit 31383c6865a5 "mm, hugetlbfs: introduce ->split() to vm_operations_struct" it would be messy to teach vma_mmu_pagesize() about device-dax page mapping sizes in the same (hstate) way that hugetlbfs communicates this attribute. Instead, these patches introduce a new ->pagesize() vm operation. 
Link: http://lkml.kernel.org/r/151996254734.27922.15813097401404359642.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reported-by: Jane Chu Reviewed-by: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Michael Ellerman Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/hugetlb.c | 19 +++++++++++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 2e2be527642a..7c06581edaa2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -390,6 +390,7 @@ struct vm_operations_struct { int (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size); void (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); + unsigned long (*pagesize)(struct vm_area_struct * area); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 92c49b9d7cbb..218679138255 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -637,14 +637,9 @@ EXPORT_SYMBOL_GPL(linear_hugepage_index); */ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) { - struct hstate *hstate; - - if (!is_vm_hugetlb_page(vma)) - return PAGE_SIZE; - - hstate = hstate_vma(vma); - - return 1UL << huge_page_shift(hstate); + if (vma->vm_ops && vma->vm_ops->pagesize) + return vma->vm_ops->pagesize(vma); + return PAGE_SIZE; } EXPORT_SYMBOL_GPL(vma_kernel_pagesize); @@ -3151,6 +3146,13 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) return 0; } +static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) +{ + struct hstate *hstate = hstate_vma(vma); + + return 1UL << huge_page_shift(hstate); +} + /* * We cannot handle pagefaults against hugetlb pages at all. They cause * handle_mm_fault() to try to instantiate regular-sized pages in the @@ -3168,6 +3170,7 @@ const struct vm_operations_struct hugetlb_vm_ops = { .open = hugetlb_vm_op_open, .close = hugetlb_vm_op_close, .split = hugetlb_vm_op_split, + .pagesize = hugetlb_vm_op_pagesize, }; static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, -- cgit v1.2.3 From c1d53b92b95c23909de11a97bcc51f26a9509231 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 5 Apr 2018 16:24:28 -0700 Subject: device-dax: implement ->pagesize() for smaps to report MMUPageSize Given that device-dax is making similar page mapping size guarantees as hugetlbfs, emit the size in smaps and any other kernel path that requests the mapping size of a vma. 
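As a usage illustration (hypothetical numbers, not output quoted from this series' testing): with these three patches applied, a device-dax mapping backed by 2MiB-aligned pages should show its fault granularity in /proc/<pid>/smaps, e.g. a line like

	MMUPageSize:        2048 kB

where an unpatched kernel would have reported 4 kB for the same vma.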
Link: http://lkml.kernel.org/r/151996255287.27922.18397777516059080245.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reported-by: Jane Chu Reviewed-by: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Michael Ellerman Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dax/device.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 2137dbc29877..0b61f48f21a6 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -439,10 +439,20 @@ static int dev_dax_split(struct vm_area_struct *vma, unsigned long addr) return 0; } +static unsigned long dev_dax_pagesize(struct vm_area_struct *vma) +{ + struct file *filp = vma->vm_file; + struct dev_dax *dev_dax = filp->private_data; + struct dax_region *dax_region = dev_dax->region; + + return dax_region->align; +} + static const struct vm_operations_struct dax_vm_ops = { .fault = dev_dax_fault, .huge_fault = dev_dax_huge_fault, .split = dev_dax_split, + .pagesize = dev_dax_pagesize, }; static int dax_mmap(struct file *filp, struct vm_area_struct *vma) -- cgit v1.2.3 From 5844a486daf2705dcdbfabe869a698bdfe629f54 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 5 Apr 2018 16:24:32 -0700 Subject: include/linux/mm.h: provide consistent declaration for num_poisoned_pages clang reports the following compile warning:

In file included from mm/vmscan.c:56:
./include/linux/swapops.h:327:22: warning: section attribute is specified on redeclared variable [-Wsection]
extern atomic_long_t num_poisoned_pages __read_mostly;
                     ^
./include/linux/mm.h:2585:22: note: previous declaration is here
extern atomic_long_t num_poisoned_pages;
                     ^

Let's use __read_mostly everywhere. Link: http://lkml.kernel.org/r/1519686565-8224-1-git-send-email-linux@roeck-us.net Signed-off-by: Guenter Roeck Reviewed-by: Andrew Morton Cc: Matthias Kaehlcke Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 7c06581edaa2..40fca1b2b6a1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2592,7 +2592,7 @@ extern int get_hwpoison_page(struct page *page); extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; extern void shake_page(struct page *p, int access); -extern atomic_long_t num_poisoned_pages; +extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(struct page *page, int flags); -- cgit v1.2.3 From 1c0ff0f1bdeb183608ac9a345996d4d1ebfed632 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 5 Apr 2018 16:24:36 -0700 Subject: fs/direct-io.c: minor cleanups in do_blockdev_direct_IO We already get the block counts and calculate the end block at the beginning of the function. Let's use the local variables for consistency and readability.
No functional changes. [akpm@linux-foundation.org: constify the locals to prevent future slipups] Link: http://lkml.kernel.org/r/1519638870-17756-1-git-send-email-nborisov@suse.com Signed-off-by: Nikolay Borisov Reviewed-by: Jeff Moyer Cc: Al Viro Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/direct-io.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index 1357ef563893..1effd7bc5d02 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1178,9 +1178,9 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, unsigned blkbits = i_blkbits; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; - size_t count = iov_iter_count(iter); + const size_t count = iov_iter_count(iter); loff_t offset = iocb->ki_pos; - loff_t end = offset + count; + const loff_t end = offset + count; struct dio *dio; struct dio_submit sdio = { 0, }; struct buffer_head map_bh = { 0, }; @@ -1201,7 +1201,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, } /* watch out for a 0 len io from a tricksy fs */ - if (iov_iter_rw(iter) == READ && !iov_iter_count(iter)) + if (iov_iter_rw(iter) == READ && !count) return 0; dio = kmem_cache_alloc(dio_cache, GFP_KERNEL); @@ -1318,8 +1318,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, dio->should_dirty = (iter->type == ITER_IOVEC); sdio.iter = iter; - sdio.final_block_in_request = - (offset + iov_iter_count(iter)) >> blkbits; + sdio.final_block_in_request = end >> blkbits; /* * In case of non-aligned buffers, we may need 2 more -- cgit v1.2.3 From cb9f753a3731f7fe16447bea45cb6f8e8bb432fb Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 5 Apr 2018 16:24:39 -0700 Subject: mm: fix races between swapoff and flush dcache Thanks to commit 4b3ef9daa4fc ("mm/swap: split swap cache into 64MB trunks"), after swapoff the address_space associated with the swap device will be freed. So page_mapping() users which may touch the address_space need some kind of mechanism to prevent the address_space from being freed while it is being accessed. The dcache flushing functions (flush_dcache_page(), etc) in architecture-specific code may access the address_space of the swap device for anonymous pages in the swap cache via the page_mapping() function. But in some cases there are no mechanisms to prevent the swap device from being swapped off, for example:

CPU1					CPU2
__get_user_pages()			swapoff()
  flush_dcache_page()
    mapping = page_mapping()
    ...				exit_swap_address_space()
    ...				  kvfree(spaces)
    mapping_mapped(mapping)

The address space may be accessed after being freed. But according to cachetlb.txt and Russell King, flush_dcache_page() only cares about file cache pages; for anonymous pages, flush_anon_page() should be used. The implementation of flush_dcache_page() in all architectures follows this too. They will check whether page_mapping() is NULL and whether mapping_mapped() is true to determine whether to flush the dcache immediately, and they will use the interval tree (mapping->i_mmap) to find all user space mappings. But mapping_mapped() and mapping->i_mmap aren't used by anonymous pages in the swap cache at all. So, to fix the race between swapoff and flush dcache, page_mapping_file() is added to return the address_space for file cache pages and NULL otherwise. All page_mapping() calls in the flush-dcache functions are replaced with page_mapping_file().
[akpm@linux-foundation.org: simplify page_mapping_file(), per Mike] Link: http://lkml.kernel.org/r/20180305083634.15174-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Andrew Morton Cc: Minchan Kim Cc: Michal Hocko Cc: Johannes Weiner Cc: Mel Gorman Cc: Dave Hansen Cc: Chen Liqin Cc: Russell King Cc: Yoshinori Sato Cc: "James E.J. Bottomley" Cc: Guan Xuetao Cc: "David S. Miller" Cc: Chris Zankel Cc: Vineet Gupta Cc: Ley Foon Tan Cc: Ralf Baechle Cc: Andi Kleen Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arc/mm/cache.c | 2 +- arch/arm/mm/copypage-v4mc.c | 2 +- arch/arm/mm/copypage-v6.c | 2 +- arch/arm/mm/copypage-xscale.c | 2 +- arch/arm/mm/fault-armv.c | 2 +- arch/arm/mm/flush.c | 6 +++--- arch/mips/mm/cache.c | 2 +- arch/nios2/mm/cacheflush.c | 4 ++-- arch/parisc/kernel/cache.c | 5 +++-- arch/sh/mm/cache-sh4.c | 2 +- arch/sh/mm/cache-sh7705.c | 2 +- arch/sparc/kernel/smp_64.c | 8 ++++---- arch/sparc/mm/init_64.c | 6 +++--- arch/sparc/mm/tlb.c | 2 +- arch/unicore32/mm/flush.c | 2 +- arch/unicore32/mm/mmu.c | 2 +- arch/xtensa/mm/cache.c | 2 +- include/linux/mm.h | 1 + mm/util.c | 10 ++++++++++ 19 files changed, 38 insertions(+), 26 deletions(-) diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index 2072f3451e9c..9dbe645ee127 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -833,7 +833,7 @@ void flush_dcache_page(struct page *page) } /* don't handle anon pages here */ - mapping = page_mapping(page); + mapping = page_mapping_file(page); if (!mapping) return; diff --git a/arch/arm/mm/copypage-v4mc.c b/arch/arm/mm/copypage-v4mc.c index 1267e64133b9..0224416cba3c 100644 --- a/arch/arm/mm/copypage-v4mc.c +++ b/arch/arm/mm/copypage-v4mc.c @@ -70,7 +70,7 @@ void v4_mc_copy_user_highpage(struct page *to, struct page *from, void *kto = kmap_atomic(to); if (!test_and_set_bit(PG_dcache_clean, &from->flags)) - __flush_dcache_page(page_mapping(from), from); + __flush_dcache_page(page_mapping_file(from), from); raw_spin_lock(&minicache_lock); diff --git a/arch/arm/mm/copypage-v6.c b/arch/arm/mm/copypage-v6.c index 70423345da26..a698e575e321 100644 --- a/arch/arm/mm/copypage-v6.c +++ b/arch/arm/mm/copypage-v6.c @@ -76,7 +76,7 @@ static void v6_copy_user_highpage_aliasing(struct page *to, unsigned long kfrom, kto; if (!test_and_set_bit(PG_dcache_clean, &from->flags)) - __flush_dcache_page(page_mapping(from), from); + __flush_dcache_page(page_mapping_file(from), from); /* FIXME: not highmem safe */ discard_old_kernel_data(page_address(to)); diff --git a/arch/arm/mm/copypage-xscale.c b/arch/arm/mm/copypage-xscale.c index 0fb85025344d..97972379f4d6 100644 --- a/arch/arm/mm/copypage-xscale.c +++ b/arch/arm/mm/copypage-xscale.c @@ -90,7 +90,7 @@ void xscale_mc_copy_user_highpage(struct page *to, struct page *from, void *kto = kmap_atomic(to); if (!test_and_set_bit(PG_dcache_clean, &from->flags)) - __flush_dcache_page(page_mapping(from), from); + __flush_dcache_page(page_mapping_file(from), from); raw_spin_lock(&minicache_lock); diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index d9e0d00a6699..4d75dae5ac96 100644 --- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c @@ -195,7 +195,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, if (page == ZERO_PAGE(0)) return; - mapping = page_mapping(page); + mapping = page_mapping_file(page); if (!test_and_set_bit(PG_dcache_clean, &page->flags)) __flush_dcache_page(mapping, page); if (mapping) { diff --git a/arch/arm/mm/flush.c 
b/arch/arm/mm/flush.c index f1e6190aa7ea..58469623b015 100644 --- a/arch/arm/mm/flush.c +++ b/arch/arm/mm/flush.c @@ -285,7 +285,7 @@ void __sync_icache_dcache(pte_t pteval) page = pfn_to_page(pfn); if (cache_is_vipt_aliasing()) - mapping = page_mapping(page); + mapping = page_mapping_file(page); else mapping = NULL; @@ -333,7 +333,7 @@ void flush_dcache_page(struct page *page) return; } - mapping = page_mapping(page); + mapping = page_mapping_file(page); if (!cache_ops_need_broadcast() && mapping && !page_mapcount(page)) @@ -363,7 +363,7 @@ void flush_kernel_dcache_page(struct page *page) if (cache_is_vivt() || cache_is_vipt_aliasing()) { struct address_space *mapping; - mapping = page_mapping(page); + mapping = page_mapping_file(page); if (!mapping || mapping_mapped(mapping)) { void *addr; diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index 44ac64d51827..0d3c656feba0 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -86,7 +86,7 @@ SYSCALL_DEFINE3(cacheflush, unsigned long, addr, unsigned long, bytes, void __flush_dcache_page(struct page *page) { - struct address_space *mapping = page_mapping(page); + struct address_space *mapping = page_mapping_file(page); unsigned long addr; if (mapping && !mapping_mapped(mapping)) { diff --git a/arch/nios2/mm/cacheflush.c b/arch/nios2/mm/cacheflush.c index 87bf88ed04c6..506f6e1c86d5 100644 --- a/arch/nios2/mm/cacheflush.c +++ b/arch/nios2/mm/cacheflush.c @@ -180,7 +180,7 @@ void flush_dcache_page(struct page *page) if (page == ZERO_PAGE(0)) return; - mapping = page_mapping(page); + mapping = page_mapping_file(page); /* Flush this page if there are aliases. */ if (mapping && !mapping_mapped(mapping)) { @@ -215,7 +215,7 @@ void update_mmu_cache(struct vm_area_struct *vma, if (page == ZERO_PAGE(0)) return; - mapping = page_mapping(page); + mapping = page_mapping_file(page); if (!test_and_set_bit(PG_dcache_clean, &page->flags)) __flush_dcache_page(mapping, page); diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index e3b45546d589..a99da95fc9fd 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -88,7 +88,8 @@ update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) return; page = pfn_to_page(pfn); - if (page_mapping(page) && test_bit(PG_dcache_dirty, &page->flags)) { + if (page_mapping_file(page) && + test_bit(PG_dcache_dirty, &page->flags)) { flush_kernel_dcache_page_addr(pfn_va(pfn)); clear_bit(PG_dcache_dirty, &page->flags); } else if (parisc_requires_coherency()) @@ -304,7 +305,7 @@ __flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, void flush_dcache_page(struct page *page) { - struct address_space *mapping = page_mapping(page); + struct address_space *mapping = page_mapping_file(page); struct vm_area_struct *mpnt; unsigned long offset; unsigned long addr, old_addr = 0; diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c index 58aaa4f33b81..eee911422cf9 100644 --- a/arch/sh/mm/cache-sh4.c +++ b/arch/sh/mm/cache-sh4.c @@ -112,7 +112,7 @@ static void sh4_flush_dcache_page(void *arg) struct page *page = arg; unsigned long addr = (unsigned long)page_address(page); #ifndef CONFIG_SMP - struct address_space *mapping = page_mapping(page); + struct address_space *mapping = page_mapping_file(page); if (mapping && !mapping_mapped(mapping)) clear_bit(PG_dcache_clean, &page->flags); diff --git a/arch/sh/mm/cache-sh7705.c b/arch/sh/mm/cache-sh7705.c index 6cd2aa395817..ed25eba80667 100644 --- a/arch/sh/mm/cache-sh7705.c +++ 
b/arch/sh/mm/cache-sh7705.c @@ -136,7 +136,7 @@ static void __flush_dcache_page(unsigned long phys) static void sh7705_flush_dcache_page(void *arg) { struct page *page = arg; - struct address_space *mapping = page_mapping(page); + struct address_space *mapping = page_mapping_file(page); if (mapping && !mapping_mapped(mapping)) clear_bit(PG_dcache_clean, &page->flags); diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index c50182cd2f64..d3ea1f3c06a0 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -929,9 +929,9 @@ static inline void __local_flush_dcache_page(struct page *page) #ifdef DCACHE_ALIASING_POSSIBLE __flush_dcache_page(page_address(page), ((tlb_type == spitfire) && - page_mapping(page) != NULL)); + page_mapping_file(page) != NULL)); #else - if (page_mapping(page) != NULL && + if (page_mapping_file(page) != NULL && tlb_type == spitfire) __flush_icache_page(__pa(page_address(page))); #endif @@ -958,7 +958,7 @@ void smp_flush_dcache_page_impl(struct page *page, int cpu) if (tlb_type == spitfire) { data0 = ((u64)&xcall_flush_dcache_page_spitfire); - if (page_mapping(page) != NULL) + if (page_mapping_file(page) != NULL) data0 |= ((u64)1 << 32); } else if (tlb_type == cheetah || tlb_type == cheetah_plus) { #ifdef DCACHE_ALIASING_POSSIBLE @@ -994,7 +994,7 @@ void flush_dcache_page_all(struct mm_struct *mm, struct page *page) pg_addr = page_address(page); if (tlb_type == spitfire) { data0 = ((u64)&xcall_flush_dcache_page_spitfire); - if (page_mapping(page) != NULL) + if (page_mapping_file(page) != NULL) data0 |= ((u64)1 << 32); } else if (tlb_type == cheetah || tlb_type == cheetah_plus) { #ifdef DCACHE_ALIASING_POSSIBLE diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index cb9ebac6663f..8aeb1aabe76e 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -206,9 +206,9 @@ inline void flush_dcache_page_impl(struct page *page) #ifdef DCACHE_ALIASING_POSSIBLE __flush_dcache_page(page_address(page), ((tlb_type == spitfire) && - page_mapping(page) != NULL)); + page_mapping_file(page) != NULL)); #else - if (page_mapping(page) != NULL && + if (page_mapping_file(page) != NULL && tlb_type == spitfire) __flush_icache_page(__pa(page_address(page))); #endif @@ -490,7 +490,7 @@ void flush_dcache_page(struct page *page) this_cpu = get_cpu(); - mapping = page_mapping(page); + mapping = page_mapping_file(page); if (mapping && !mapping_mapped(mapping)) { int dirty = test_bit(PG_dcache_dirty, &page->flags); if (dirty) { diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c index b5cfab711651..3d72d2deb13b 100644 --- a/arch/sparc/mm/tlb.c +++ b/arch/sparc/mm/tlb.c @@ -128,7 +128,7 @@ void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, goto no_cache_flush; /* A real file page? 
*/ - mapping = page_mapping(page); + mapping = page_mapping_file(page); if (!mapping) goto no_cache_flush; diff --git a/arch/unicore32/mm/flush.c b/arch/unicore32/mm/flush.c index 6d4c096ffa2a..74f4d636df2d 100644 --- a/arch/unicore32/mm/flush.c +++ b/arch/unicore32/mm/flush.c @@ -83,7 +83,7 @@ void flush_dcache_page(struct page *page) if (page == ZERO_PAGE(0)) return; - mapping = page_mapping(page); + mapping = page_mapping_file(page); if (mapping && !mapping_mapped(mapping)) clear_bit(PG_dcache_clean, &page->flags); diff --git a/arch/unicore32/mm/mmu.c b/arch/unicore32/mm/mmu.c index 4f5a532bee13..0c94b7b4514d 100644 --- a/arch/unicore32/mm/mmu.c +++ b/arch/unicore32/mm/mmu.c @@ -503,7 +503,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, if (page == ZERO_PAGE(0)) return; - mapping = page_mapping(page); + mapping = page_mapping_file(page); if (!test_and_set_bit(PG_dcache_clean, &page->flags)) __flush_dcache_page(mapping, page); if (mapping) diff --git a/arch/xtensa/mm/cache.c b/arch/xtensa/mm/cache.c index 57dc231a0709..9220dcde7520 100644 --- a/arch/xtensa/mm/cache.c +++ b/arch/xtensa/mm/cache.c @@ -127,7 +127,7 @@ EXPORT_SYMBOL(copy_user_highpage); void flush_dcache_page(struct page *page) { - struct address_space *mapping = page_mapping(page); + struct address_space *mapping = page_mapping_file(page); /* * If we have a mapping but the page is not mapped to user-space diff --git a/include/linux/mm.h b/include/linux/mm.h index 40fca1b2b6a1..88d82ba29d72 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1155,6 +1155,7 @@ static inline pgoff_t page_index(struct page *page) bool page_mapped(struct page *page); struct address_space *page_mapping(struct page *page); +struct address_space *page_mapping_file(struct page *page); /* * Return true only if the page has been allocated with diff --git a/mm/util.c b/mm/util.c index c1250501364f..029fc2f3b395 100644 --- a/mm/util.c +++ b/mm/util.c @@ -515,6 +515,16 @@ struct address_space *page_mapping(struct page *page) } EXPORT_SYMBOL(page_mapping); +/* + * For file cache pages, return the address_space, otherwise return NULL + */ +struct address_space *page_mapping_file(struct page *page) +{ + if (unlikely(PageSwapCache(page))) + return NULL; + return page_mapping(page); +} + /* Slow path of page_mapcount() for compound pages */ int __page_mapcount(struct page *page) { -- cgit v1.2.3 From 010b495e2fa32353d0ef6aa70a8169e5ef617a15 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 5 Apr 2018 16:24:43 -0700 Subject: zsmalloc: introduce zs_huge_class_size() Patch series "zsmalloc/zram: drop zram's max_zpage_size", v3. ZRAM's max_zpage_size is a bad thing. It forces zsmalloc to store normal objects as huge ones, which results in bigger zsmalloc memory usage. Drop it and use the actual zsmalloc huge-class value when deciding whether an object is huge. This patch (of 2): Not every object can share its zspage with other objects, e.g. when the object is as big as a zspage or nearly as big as a zspage. For such objects zsmalloc has a so-called huge class: every object which belongs to a huge class consumes an entire zspage (which consists of a single physical page). On an x86_64 box with PAGE_SHIFT 12, the first non-huge class size is 3264, so starting down from size 3264, objects can share page(-s) and thus minimize memory wastage.
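As a worked example of the sharing this enables (sizes taken from the class table in the test results of the next patch, assuming 4096-byte pages): class-3264 packs its objects into zspages of 4 physical pages, so each zspage holds floor(4 * 4096 / 3264) = 5 objects and wastes only 16384 - 5 * 3264 = 64 bytes, whereas every object pushed into the huge class occupies a full 4096-byte page by itself.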
ZRAM, however, has its own statically defined watermark for huge objects, namely "3 * PAGE_SIZE / 4 = 3072", and forcibly stores every object larger than this watermark (3072) as a PAGE_SIZE object, in other words, in a huge class, while zsmalloc can keep some of those objects in non-huge classes. This results in increased memory consumption. zsmalloc knows best whether an object is huge. Introduce a zs_huge_class_size() function which tells whether a given object can be stored in one of the non-huge classes. This will let us drop ZRAM's huge object watermark and fully rely on zsmalloc when we decide if the object is huge. [sergey.senozhatsky.work@gmail.com: add pool param to zs_huge_class_size()] Link: http://lkml.kernel.org/r/20180314081833.1096-2-sergey.senozhatsky@gmail.com Link: http://lkml.kernel.org/r/20180306070639.7389-2-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/zsmalloc.h | 2 ++ mm/zsmalloc.c | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 57a8e98f2708..2219cce81ca4 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -47,6 +47,8 @@ void zs_destroy_pool(struct zs_pool *pool); unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags); void zs_free(struct zs_pool *pool, unsigned long obj); +size_t zs_huge_class_size(struct zs_pool *pool); + void *zs_map_object(struct zs_pool *pool, unsigned long handle, enum zs_mapmode mm); void zs_unmap_object(struct zs_pool *pool, unsigned long handle); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index a583ab111a43..5a532ebedc44 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -193,6 +193,7 @@ static struct vfsmount *zsmalloc_mnt; * (see: fix_fullness_group()) */ static const int fullness_threshold_frac = 4; +static size_t huge_class_size; struct size_class { spinlock_t lock; @@ -1407,6 +1408,25 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) } EXPORT_SYMBOL_GPL(zs_unmap_object); +/** + * zs_huge_class_size() - Returns the size (in bytes) of the first huge + * zsmalloc &size_class. + * @pool: zsmalloc pool to use + * + * The function returns the size of the first huge class - any object of equal + * or bigger size will be stored in zspage consisting of a single physical + * page. + * + * Context: Any context. + * + * Return: the size (in bytes) of the first huge zsmalloc &size_class. + */ +size_t zs_huge_class_size(struct zs_pool *pool) +{ + return huge_class_size; +} +EXPORT_SYMBOL_GPL(zs_huge_class_size); + static unsigned long obj_malloc(struct size_class *class, struct zspage *zspage, unsigned long handle) { @@ -2363,6 +2383,27 @@ struct zs_pool *zs_create_pool(const char *name) pages_per_zspage = get_pages_per_zspage(size); objs_per_zspage = pages_per_zspage * PAGE_SIZE / size; + /* + * We iterate from biggest down to smallest classes, + * so huge_class_size holds the size of the first huge + * class. Any object bigger than or equal to that will + * endup in the huge class. + */ + if (pages_per_zspage != 1 && objs_per_zspage != 1 && + !huge_class_size) { + huge_class_size = size; + /* + * The object uses ZS_HANDLE_SIZE bytes to store the + * handle.
We need to subtract it, because zs_malloc() + * unconditionally adds handle size before it performs + * size class search - so object may be smaller than + * huge class size, yet it still can end up in the huge + * class because it grows by ZS_HANDLE_SIZE extra bytes + * right before class lookup. + */ + huge_class_size -= (ZS_HANDLE_SIZE - 1); + } + /* * size_class is used for normal zsmalloc operation such * as alloc/free for that size. Although it is natural that we -- cgit v1.2.3 From 60f5921a9a4f126e081318bd6bb2bc2798b7bba8 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 5 Apr 2018 16:24:47 -0700 Subject: zram: drop max_zpage_size and use zs_huge_class_size() Remove ZRAM's enforced "huge object" value and use the zsmalloc huge-class watermark instead, which makes more sense. TEST - I used a 1G zram device, LZO compression back-end, original data set size was 444MB. Looking at the zsmalloc class stats, the test ended up being pretty fair.

BASE ZRAM/ZSMALLOC
=====================
zram mm_stat
498978816 191482495 199831552 0 199831552 15634 0

zsmalloc classes
class  size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable
...
  151  2448           0            0          1240     1240        744                3        0
  168  2720           0            0          4200     4200       2800                2        0
  190  3072           0            0         10100    10100       7575                3        0
  202  3264           0            0           380      380        304                4        0
  254  4096           0            0         10620    10620      10620                1        0
Total                 7           46        106982   106187      48787                         0

PATCHED ZRAM/ZSMALLOC
=====================
zram mm_stat
498978816 182579184 194248704 0 194248704 15628 0

zsmalloc classes
class  size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable
...
  151  2448           0            0          1240     1240        744                3        0
  168  2720           0            0          4200     4200       2800                2        0
  190  3072           0            0         10100    10100       7575                3        0
  202  3264           0            0          7180     7180       5744                4        0
  254  4096           0            0          3820     3820       3820                1        0
Total                 8           45        106959   106193      47424                         0

As we can see, we reduced the number of objects stored in class-4096, because a huge number of objects which we previously forcibly stored in class-4096 are now stored in the non-huge class-3264. This results in lower memory consumption: - zsmalloc now uses 47424 physical pages, which is less than the 48787 pages zsmalloc used before; - objects that we store in class-3264 share zspages, which is why overall the number of pages that both class-4096 and class-3264 consumed went down from 10924 to 9564. [sergey.senozhatsky.work@gmail.com: add pool param to zs_huge_class_size()] Link: http://lkml.kernel.org/r/20180314081833.1096-3-sergey.senozhatsky@gmail.com Link: http://lkml.kernel.org/r/20180306070639.7389-3-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 9 ++++++++- drivers/block/zram/zram_drv.h | 16 ---------------- 2 files changed, 8 insertions(+), 17 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 0afa6c8c3857..bd38f46801d3 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -44,6 +44,11 @@ static const char *default_compressor = "lzo"; /* Module params (documentation at end) */ static unsigned int num_devices = 1; +/* + * Pages that compress to sizes equals or greater than this are stored + * uncompressed in memory.
+ */ +static size_t huge_class_size; static void zram_free_page(struct zram *zram, size_t index); @@ -786,6 +791,8 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) return false; } + if (!huge_class_size) + huge_class_size = zs_huge_class_size(zram->mem_pool); return true; } @@ -965,7 +972,7 @@ compress_again: return ret; } - if (unlikely(comp_len > max_zpage_size)) { + if (unlikely(comp_len >= huge_class_size)) { if (zram_wb_enabled(zram) && allow_wb) { zcomp_stream_put(zram->comp); ret = write_to_bdev(zram, bvec, index, bio, &element); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 31762db861e3..d71c8000a964 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -21,22 +21,6 @@ #include "zcomp.h" -/*-- Configurable parameters */ - -/* - * Pages that compress to size greater than this are stored - * uncompressed in memory. - */ -static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; - -/* - * NOTE: max_zpage_size must be less than or equal to: - * ZS_MAX_ALLOC_SIZE. Otherwise, zs_malloc() would - * always return failure. - */ - -/*-- End of configurable params */ - #define SECTOR_SHIFT 9 #define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) #define SECTORS_PER_PAGE (1 << SECTORS_PER_PAGE_SHIFT) -- cgit v1.2.3 From e48e3c590aac5ba9a7b6106239543c251931ba2d Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 5 Apr 2018 16:24:50 -0700 Subject: mm/nommu: remove description of alloc_vm_area The alloc_vm_area() in nommu is a stub, but its description states it allocates kernel address space. Remove the description to make the code and the documentation agree. Link: http://lkml.kernel.org/r/1519585191-10180-2-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nommu.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/mm/nommu.c b/mm/nommu.c index 4f8720243ae7..13723736d38f 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -457,18 +457,6 @@ void __weak vmalloc_sync_all(void) { } -/** - * alloc_vm_area - allocate a range of kernel address space - * @size: size of the area - * - * Returns: NULL on failure, vm_struct on success - * - * This function reserves a range of kernel address space, and - * allocates pagetables to map that range. No actual mappings - * are created. If the kernel address space is not shared - * between processes, it syncs the pagetable across all - * processes. - */ struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) { BUG(); -- cgit v1.2.3 From 002843de36e18bd5be6f5bb858c0de18b6447a64 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 5 Apr 2018 16:24:53 -0700 Subject: mm/swap.c: remove @cold parameter description for release_pages() The 'cold' parameter was removed from the release_pages() function by commit c6f92f9fbe7d ("mm: remove cold parameter for release_pages"). Update the description to match the code.
Link: http://lkml.kernel.org/r/1519585191-10180-3-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/swap.c b/mm/swap.c index 0f17330dd0e5..3dd518832096 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -707,7 +707,6 @@ void lru_add_drain_all(void) * release_pages - batched put_page() * @pages: array of pages to release * @nr: number of pages - * @cold: whether the pages are cache cold * * Decrement the reference count on all the pages in @pages. If it * fell to zero, remove the page from the LRU and free it. -- cgit v1.2.3 From e8b098fc5747a7c871f113c9eb65453cc2d86e6f Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 5 Apr 2018 16:24:57 -0700 Subject: mm: kernel-doc: add missing parameter descriptions Link: http://lkml.kernel.org/r/1519585191-10180-4-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 5 +++++ mm/compaction.c | 1 + mm/kmemleak.c | 10 ++++++++++ mm/memory_hotplug.c | 6 ++++++ mm/oom_kill.c | 2 ++ mm/pagewalk.c | 3 +++ mm/rmap.c | 1 + mm/zsmalloc.c | 2 ++ 8 files changed, 30 insertions(+) diff --git a/mm/cma.c b/mm/cma.c index 0607729abf3b..0600fc08a9f4 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -165,6 +165,9 @@ core_initcall(cma_init_reserved_areas); * @base: Base address of the reserved area * @size: Size of the reserved area (in bytes), * @order_per_bit: Order of pages represented by one bit on bitmap. + * @name: The name of the area. If this parameter is NULL, the name of + * the area will be set to "cmaN", where N is a running counter of + * used areas. * @res_cma: Pointer to store the created cma region. * * This function creates custom contiguous area from already reserved memory. @@ -227,6 +230,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, * @alignment: Alignment for the CMA area, should be power of 2 or zero * @order_per_bit: Order of pages represented by one bit on bitmap. * @fixed: hint about where to place the reserved area + * @name: The name of the area. See function cma_init_reserved_mem() * @res_cma: Pointer to store the created cma region. * * This function reserves memory from early allocator. It should be @@ -390,6 +394,7 @@ static inline void cma_debug_show_areas(struct cma *cma) { } * @cma: Contiguous memory region for which the allocation is performed. * @count: Requested number of pages. * @align: Requested alignment of pages (in PAGE_SIZE order). + * @gfp_mask: GFP mask to use during compaction * * This function allocates part of contiguous memory on specific * contiguous memory area. diff --git a/mm/compaction.c b/mm/compaction.c index a68230ab451d..88d01a50a015 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -576,6 +576,7 @@ isolate_fail: /** * isolate_freepages_range() - isolate free pages. + * @cc: Compaction control structure. * @start_pfn: The first PFN to start isolating. * @end_pfn: The one-past-last PFN. * diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 8029501dc65c..9a085d525bbc 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1187,6 +1187,11 @@ EXPORT_SYMBOL(kmemleak_no_scan); /** * kmemleak_alloc_phys - similar to kmemleak_alloc but taking a physical * address argument + * @phys: physical address of the object + * @size: size of the object + * @min_count: minimum number of references to this object. 
+ * See kmemleak_alloc() + * @gfp: kmalloc() flags used for kmemleak internal memory allocations */ void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count, gfp_t gfp) @@ -1199,6 +1204,9 @@ EXPORT_SYMBOL(kmemleak_alloc_phys); /** * kmemleak_free_part_phys - similar to kmemleak_free_part but taking a * physical address argument + * @phys: physical address if the beginning or inside an object. This + * also represents the start of the range to be freed + * @size: size to be unregistered */ void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size) { @@ -1210,6 +1218,7 @@ EXPORT_SYMBOL(kmemleak_free_part_phys); /** * kmemleak_not_leak_phys - similar to kmemleak_not_leak but taking a physical * address argument + * @phys: physical address of the object */ void __ref kmemleak_not_leak_phys(phys_addr_t phys) { @@ -1221,6 +1230,7 @@ EXPORT_SYMBOL(kmemleak_not_leak_phys); /** * kmemleak_ignore_phys - similar to kmemleak_ignore but taking a physical * address argument + * @phys: physical address of the object */ void __ref kmemleak_ignore_phys(phys_addr_t phys) { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6a9ba14e18ed..cc6dfa5832ca 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -541,6 +541,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms, * @zone: zone from which pages need to be removed * @phys_start_pfn: starting pageframe (must be aligned to start of a section) * @nr_pages: number of pages to remove (must be multiple of section size) + * @altmap: alternative device page map or %NULL if default memmap is used * * Generic helper function to remove section mappings and sysfs entries * for the section of the memory we are removing. Caller needs to make @@ -1044,6 +1045,7 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) /** * try_online_node - online a node if offlined + * @nid: the node ID * * called by cpu_up() to online a node without onlined memory. */ @@ -1804,6 +1806,7 @@ static int check_and_unmap_cpu_on_node(pg_data_t *pgdat) /** * try_offline_node + * @nid: the node ID * * Offline a node if all memory sections and cpus of the node are removed. * @@ -1847,6 +1850,9 @@ EXPORT_SYMBOL(try_offline_node); /** * remove_memory + * @nid: the node ID + * @start: physical address of the region to remove + * @size: size of the region to remove * * NOTE: The caller must call lock_device_hotplug() to serialize hotplug * and online/offline operations before this call, as required by diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f2e7dfb81eee..82a92ad67af3 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -185,6 +185,8 @@ static bool is_dump_unreclaim_slabs(void) * oom_badness - heuristic function to determine which candidate task to kill * @p: task struct of which task we should calculate * @totalpages: total present RAM allowed for page allocation + * @memcg: task's memory controller, if constrained + * @nodemask: nodemask passed to page allocator for mempolicy ooms * * The heuristic for determining which task to kill is made to be as simple and * predictable as possible. 
The goal is to return the highest value for the diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 8d2da5dec1e0..c3084ff2569d 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -258,6 +258,9 @@ static int __walk_page_range(unsigned long start, unsigned long end, /** * walk_page_range - walk page table with caller specific callbacks + * @start: start address of the virtual address range + * @end: end address of the virtual address range + * @walk: mm_walk structure defining the callbacks and the target address space * * Recursively walk the page table tree of the process represented by @walk->mm * within the virtual address range [@start, @end). During walking, we can do diff --git a/mm/rmap.c b/mm/rmap.c index 144c66e688a9..9122787c4947 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1171,6 +1171,7 @@ void page_add_new_anon_rmap(struct page *page, /** * page_add_file_rmap - add pte mapping to a file page * @page: the page to add the mapping to + * @compound: charge the page as compound or small page * * The caller needs to hold the pte lock. */ diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 5a532ebedc44..61cb05dc950c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -851,6 +851,7 @@ static struct page *get_next_page(struct page *page) /** * obj_to_location - get (<page>, <obj_idx>) from encoded object value + * @obj: the encoded object value * @page: page object resides in zspage * @obj_idx: object index */ @@ -1301,6 +1302,7 @@ EXPORT_SYMBOL_GPL(zs_get_total_pages); * zs_map_object - get address of allocated object from handle. * @pool: pool from which the object was allocated * @handle: handle returned from zs_malloc + * @mm: maping mode to use * * Before using an object allocated from zs_malloc, it must be mapped using * this function. When done with the object, it must be unmapped using -- cgit v1.2.3 From 3172485f4f8032649c144e4aafa550e1e6179332 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Thu, 5 Apr 2018 16:25:01 -0700 Subject: block_invalidatepage(): only release page if the full page was invalidated Prior to commit d47992f86b30 ("mm: change invalidatepage prototype to accept length"), an offset of 0 meant that the full page was being invalidated. After that commit, we need to instead check the length. Jan said:

: The only possible issue is that try_to_release_page() was called more
: often than necessary. Otherwise the issue is harmless but still it's good
: to have this fixed.

Link: http://lkml.kernel.org/r/x49fu5rtnzs.fsf@segfault.boston.devel.redhat.com Fixes: d47992f86b307 ("mm: change invalidatepage prototype to accept length") Signed-off-by: Jeff Moyer Reviewed-by: Jan Kara Cc: Lukas Czerner Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/buffer.c b/fs/buffer.c index 9a73924db22f..ec5dd39071e6 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1511,7 +1511,7 @@ void block_invalidatepage(struct page *page, unsigned int offset, * The get_block cached value has been unconditionally invalidated, * so real IO is not possible anymore. */ - if (offset == 0) + if (length == PAGE_SIZE) try_to_release_page(page, 0); out: return; -- cgit v1.2.3 From f5c754d63d069dabe2afc6f248188b5148e08687 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 5 Apr 2018 16:25:05 -0700 Subject: mm/swap_state.c: make bool enable_vma_readahead and swap_vma_readahead() static The bool enable_vma_readahead and swap_vma_readahead() are local to the source and do not need to be in global scope, so make them static.
Cleans up sparse warnings:

mm/swap_state.c:41:6: warning: symbol 'enable_vma_readahead' was not declared. Should it be static?
mm/swap_state.c:742:13: warning: symbol 'swap_vma_readahead' was not declared. Should it be static?

Link: http://lkml.kernel.org/r/20180223164852.5159-1-colin.king@canonical.com Signed-off-by: Colin Ian King Reviewed-by: Andrew Morton Acked-by: "Huang, Ying" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index b97da97b6846..f233dccd3b1b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -38,7 +38,7 @@ static const struct address_space_operations swap_aops = { struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly; static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly; -bool enable_vma_readahead __read_mostly = true; +static bool enable_vma_readahead __read_mostly = true; #define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) #define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) @@ -733,8 +733,8 @@ static void swap_ra_info(struct vm_fault *vmf, pte_unmap(orig_pte); } -struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, - struct vm_fault *vmf) +static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, + struct vm_fault *vmf) { struct blk_plug plug; struct vm_area_struct *vma = vmf->vma; -- cgit v1.2.3 From 0c7c1bed7e13dbb545375c231e6ba1dca5e8d725 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 5 Apr 2018 16:25:08 -0700 Subject: mm: make counting of list_lru_one::nr_items lockless During slab reclaim for a memcg, shrink_slab iterates over all registered shrinkers in the system and tries to count and consume objects related to the cgroup. Under memory pressure, this behaves badly: I observe high system time and much time spent in list_lru_count_one() for many processes on a RHEL7 kernel. This patch makes list_lru_node::memcg_lrus RCU protected, which allows us to skip taking the spinlock in list_lru_count_one(). With the patch, Shakeel Butt observes a significant change in the perf graph. He says: ======================================================================== Setup: running a fork-bomb in a memcg of 200MiB on an 8GiB, 4-vCPU VM and recording the trace with 'perf record -g -a'. The trace without the patch:

+ 34.19% fb.sh [kernel.kallsyms] [k] queued_spin_lock_slowpath
+ 30.77% fb.sh [kernel.kallsyms] [k] _raw_spin_lock
+  3.53% fb.sh [kernel.kallsyms] [k] list_lru_count_one
+  2.26% fb.sh [kernel.kallsyms] [k] super_cache_count
+  1.68% fb.sh [kernel.kallsyms] [k] shrink_slab
+  0.59% fb.sh [kernel.kallsyms] [k] down_read_trylock
+  0.48% fb.sh [kernel.kallsyms] [k] _raw_spin_unlock_irqrestore
+  0.38% fb.sh [kernel.kallsyms] [k] shrink_node_memcg
+  0.32% fb.sh [kernel.kallsyms] [k] queue_work_on
+  0.26% fb.sh [kernel.kallsyms] [k] count_shadow_nodes

With the patch:

+  0.16% swapper     [kernel.kallsyms] [k] default_idle
+  0.13% oom_reaper  [kernel.kallsyms] [k] mutex_spin_on_owner
+  0.05% perf        [kernel.kallsyms] [k] copy_user_generic_string
+  0.05% init.real   [kernel.kallsyms] [k] wait_consider_task
+  0.05% kworker/0:0 [kernel.kallsyms] [k] finish_task_switch
+  0.04% kworker/2:1 [kernel.kallsyms] [k] finish_task_switch
+  0.04% kworker/3:1 [kernel.kallsyms] [k] finish_task_switch
+  0.04% kworker/1:0 [kernel.kallsyms] [k] finish_task_switch
+  0.03% binary      [kernel.kallsyms] [k] copy_page

======================================================================== Thanks Shakeel for the testing.
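The change is an instance of the standard RCU publish/subscribe pattern; a compressed, hedged sketch of the two sides as used here (the patch itself additionally keeps nlru->lock so existing writers need not enter an RCU read-side critical section):

	/* Reader: count without taking the spinlock. */
	rcu_read_lock();
	memcg_lrus = rcu_dereference(nlru->memcg_lrus);
	nr_items = memcg_lrus->lru[idx]->nr_items;
	rcu_read_unlock();

	/* Updater: publish the resized array, then free the old one
	 * only after every reader that might still see it has left
	 * its read-side critical section. */
	rcu_assign_pointer(nlru->memcg_lrus, new);
	call_rcu(&old->rcu, kvfree_rcu);

The grace period provided by call_rcu() is what makes it safe for readers to dereference the array without holding nlru->lock.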
[ktkhai@virtuozzo.com: v2] Link: http://lkml.kernel.org/r/151203869520.3915.2587549826865799173.stgit@localhost.localdomain Link: http://lkml.kernel.org/r/150583358557.26700.8490036563698102569.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Tested-by: Shakeel Butt Acked-by: Vladimir Davydov Cc: Andrey Ryabinin Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list_lru.h | 3 ++- mm/list_lru.c | 67 ++++++++++++++++++++++++++++++++---------------- 2 files changed, 47 insertions(+), 23 deletions(-) diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index bb8129a3474d..96def9d15b1b 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -32,6 +32,7 @@ struct list_lru_one { }; struct list_lru_memcg { + struct rcu_head rcu; /* array of per cgroup lists, indexed by memcg_cache_id */ struct list_lru_one *lru[0]; }; @@ -43,7 +44,7 @@ struct list_lru_node { struct list_lru_one lru; #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) /* for cgroup aware lrus points to per cgroup lists, otherwise NULL */ - struct list_lru_memcg *memcg_lrus; + struct list_lru_memcg __rcu *memcg_lrus; #endif long nr_items; } ____cacheline_aligned_in_smp; diff --git a/mm/list_lru.c b/mm/list_lru.c index fd41e969ede5..fcfb6c89ed47 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -52,14 +52,15 @@ static inline bool list_lru_memcg_aware(struct list_lru *lru) static inline struct list_lru_one * list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) { + struct list_lru_memcg *memcg_lrus; /* - * The lock protects the array of per cgroup lists from relocation - * (see memcg_update_list_lru_node). + * Either lock or RCU protects the array of per cgroup lists + * from relocation (see memcg_update_list_lru_node). */ - lockdep_assert_held(&nlru->lock); - if (nlru->memcg_lrus && idx >= 0) - return nlru->memcg_lrus->lru[idx]; - + memcg_lrus = rcu_dereference_check(nlru->memcg_lrus, + lockdep_is_held(&nlru->lock)); + if (memcg_lrus && idx >= 0) + return memcg_lrus->lru[idx]; return &nlru->lru; } @@ -168,10 +169,10 @@ static unsigned long __list_lru_count_one(struct list_lru *lru, struct list_lru_one *l; unsigned long count; - spin_lock(&nlru->lock); + rcu_read_lock(); l = list_lru_from_memcg_idx(nlru, memcg_idx); count = l->nr_items; - spin_unlock(&nlru->lock); + rcu_read_unlock(); return count; } @@ -324,24 +325,41 @@ fail: static int memcg_init_list_lru_node(struct list_lru_node *nlru) { + struct list_lru_memcg *memcg_lrus; int size = memcg_nr_cache_ids; - nlru->memcg_lrus = kvmalloc(size * sizeof(void *), GFP_KERNEL); - if (!nlru->memcg_lrus) + memcg_lrus = kvmalloc(sizeof(*memcg_lrus) + + size * sizeof(void *), GFP_KERNEL); + if (!memcg_lrus) return -ENOMEM; - if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) { - kvfree(nlru->memcg_lrus); + if (__memcg_init_list_lru_node(memcg_lrus, 0, size)) { + kvfree(memcg_lrus); return -ENOMEM; } + RCU_INIT_POINTER(nlru->memcg_lrus, memcg_lrus); return 0; } static void memcg_destroy_list_lru_node(struct list_lru_node *nlru) { - __memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids); - kvfree(nlru->memcg_lrus); + struct list_lru_memcg *memcg_lrus; + /* + * This is called when shrinker has already been unregistered, + * and nobody can use it. So, there is no need to use kvfree_rcu(). 
+ */ + memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, true); + __memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids); + kvfree(memcg_lrus); +} + +static void kvfree_rcu(struct rcu_head *head) +{ + struct list_lru_memcg *mlru; + + mlru = container_of(head, struct list_lru_memcg, rcu); + kvfree(mlru); } static int memcg_update_list_lru_node(struct list_lru_node *nlru, @@ -351,8 +369,9 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, BUG_ON(old_size > new_size); - old = nlru->memcg_lrus; - new = kvmalloc(new_size * sizeof(void *), GFP_KERNEL); + old = rcu_dereference_protected(nlru->memcg_lrus, + lockdep_is_held(&list_lrus_mutex)); + new = kvmalloc(sizeof(*new) + new_size * sizeof(void *), GFP_KERNEL); if (!new) return -ENOMEM; @@ -361,29 +380,33 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, return -ENOMEM; } - memcpy(new, old, old_size * sizeof(void *)); + memcpy(&new->lru, &old->lru, old_size * sizeof(void *)); /* - * The lock guarantees that we won't race with a reader - * (see list_lru_from_memcg_idx). + * The locking below allows readers that hold nlru->lock avoid taking + * rcu_read_lock (see list_lru_from_memcg_idx). * * Since list_lru_{add,del} may be called under an IRQ-safe lock, * we have to use IRQ-safe primitives here to avoid deadlock. */ spin_lock_irq(&nlru->lock); - nlru->memcg_lrus = new; + rcu_assign_pointer(nlru->memcg_lrus, new); spin_unlock_irq(&nlru->lock); - kvfree(old); + call_rcu(&old->rcu, kvfree_rcu); return 0; } static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru, int old_size, int new_size) { + struct list_lru_memcg *memcg_lrus; + + memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, + lockdep_is_held(&list_lrus_mutex)); /* do not bother shrinking the array back to the old size, because we * cannot handle allocation failures here */ - __memcg_destroy_list_lru_node(nlru->memcg_lrus, old_size, new_size); + __memcg_destroy_list_lru_node(memcg_lrus, old_size, new_size); } static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) -- cgit v1.2.3 From 3eda69c92d4751977baf2d34e88a29d4b6affa7d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 5 Apr 2018 16:25:12 -0700 Subject: kernel/fork.c: detect early free of a live mm KASAN splats indicate that in some cases we free a live mm, then continue to access it, with potentially disastrous results. This is likely due to a mismatched mmdrop() somewhere in the kernel, but so far the culprit remains elusive. Let's have __mmdrop() verify that the mm isn't live for the current task, similar to the existing check for init_mm. This way, we can catch this class of issue earlier, and without requiring KASAN. Currently, idle_task_exit() leaves active_mm stale after it switches to init_mm. This isn't harmful, but will trigger the new assertions, so we must adjust idle_task_exit() to update active_mm. 
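For illustration, the class of bug the new assertions catch looks roughly like the following (hypothetical code, not the actual culprit; mmgrab() and mmdrop() must be strictly paired):

	struct mm_struct *mm = current->mm;

	mmgrab(mm);	/* take an mm_count reference */
	do_work(mm);	/* hypothetical user of the reference */
	mmdrop(mm);	/* balanced so far... */
	mmdrop(mm);	/* ...but this extra drop can push mm_count to zero
			 * and free an mm that current still owns; __mmdrop()
			 * now warns here instead of freeing a live mm
			 * silently */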
Link: http://lkml.kernel.org/r/20180312140103.19235-1-mark.rutland@arm.com Signed-off-by: Mark Rutland Reviewed-by: Andrew Morton Cc: Ingo Molnar Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 ++ kernel/sched/core.c | 1 + 2 files changed, 3 insertions(+) diff --git a/kernel/fork.c b/kernel/fork.c index f71b67dc156d..242c8c93d285 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -595,6 +595,8 @@ static void check_mm(struct mm_struct *mm) void __mmdrop(struct mm_struct *mm) { BUG_ON(mm == &init_mm); + WARN_ON_ONCE(mm == current->mm); + WARN_ON_ONCE(mm == current->active_mm); mm_free_pgd(mm); destroy_context(mm); hmm_mm_destroy(mm); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 28b68995a417..e8afd6086f23 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5560,6 +5560,7 @@ void idle_task_exit(void) if (mm != &init_mm) { switch_mm(mm, &init_mm, current); + current->active_mm = &init_mm; finish_arch_post_lock_switch(); } mmdrop(mm); -- cgit v1.2.3 From 5ecd9d403ad081ed2de7b118c1e96124d4e0ba6c Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 5 Apr 2018 16:25:16 -0700 Subject: mm, page_alloc: wakeup kcompactd even if kswapd cannot free more memory Kswapd will not wake up if per-zone watermarks are not failing or if too many previous attempts at background reclaim have failed. This can be true if there is a lot of free memory available. For high-order allocations, kswapd is responsible for waking up kcompactd for background compaction. If the zone is not below its watermarks or reclaim has recently failed (lots of free memory, nothing left to reclaim), kcompactd does not get woken up. When __GFP_DIRECT_RECLAIM is not allowed, allow kcompactd to still be woken up even if kswapd will not reclaim. This allows high-order allocations, such as thp, to still trigger background compaction even when the zone has an abundance of free memory.
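As an illustration, the decision this patch moves into wakeup_kswapd() can be modeled in standalone C; this is a hedged sketch under invented names (the MODEL_ macro and the two bool flags are stand-ins for __GFP_DIRECT_RECLAIM and the pgdat state), not the kernel implementation.

/* Toy model of the patched wakeup path. */
#include <stdbool.h>
#include <stdio.h>

#define MODEL_GFP_DIRECT_RECLAIM (1u << 0) /* stand-in for __GFP_DIRECT_RECLAIM */

static bool kswapd_hopeless;   /* too many failed reclaim attempts */
static bool node_balanced;     /* watermarks are already satisfied */

static void wake_kcompactd(void) { puts("wake kcompactd"); }
static void wake_kswapd(void)    { puts("wake kswapd"); }

static void wakeup_kswapd_model(unsigned int gfp_flags)
{
        if (kswapd_hopeless || node_balanced) {
                /* Plenty of free memory (or nothing reclaimable), but
                 * possibly too fragmented for a high-order request: if
                 * the caller cannot enter direct reclaim itself, still
                 * kick background compaction. */
                if (!(gfp_flags & MODEL_GFP_DIRECT_RECLAIM))
                        wake_kcompactd();
                return;
        }
        wake_kswapd();  /* kswapd wakes kcompactd after reclaiming */
}

int main(void)
{
        node_balanced = true;       /* abundant but fragmented memory */
        wakeup_kswapd_model(0);     /* e.g. an atomic high-order allocation */
        return 0;
}

Before the patch, the balanced-or-hopeless path simply returned, so such allocations never got background compaction started.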
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1803111659420.209721@chino.kir.corp.google.com Signed-off-by: David Rientjes Acked-by: Vlastimil Babka Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../trace/postprocess/trace-vmscan-postprocess.pl | 4 +-- include/linux/mmzone.h | 3 +- include/trace/events/vmscan.h | 17 +++++++----- mm/page_alloc.c | 14 ++++++---- mm/vmscan.c | 32 ++++++++++++++++------ 5 files changed, 45 insertions(+), 25 deletions(-) diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl index ba976805853a..66bfd8396877 100644 --- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl +++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl @@ -111,7 +111,7 @@ my $regex_direct_begin_default = 'order=([0-9]*) may_writepage=([0-9]*) gfp_flag my $regex_direct_end_default = 'nr_reclaimed=([0-9]*)'; my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)'; my $regex_kswapd_sleep_default = 'nid=([0-9]*)'; -my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*)'; +my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*) gfp_flags=([A-Z_|]*)'; my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) classzone_idx=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_skipped=([0-9]*) nr_taken=([0-9]*) lru=([a-z_]*)'; my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)'; my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)'; @@ -201,7 +201,7 @@ $regex_kswapd_sleep = generate_traceevent_regex( $regex_wakeup_kswapd = generate_traceevent_regex( "vmscan/mm_vmscan_wakeup_kswapd", $regex_wakeup_kswapd_default, - "nid", "zid", "order"); + "nid", "zid", "order", "gfp_flags"); $regex_lru_isolate = generate_traceevent_regex( "vmscan/mm_vmscan_lru_isolate", $regex_lru_isolate_default, diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5d935411d3c4..f11ae29005f1 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -776,7 +776,8 @@ static inline bool is_dev_zone(const struct zone *zone) #include void build_all_zonelists(pg_data_t *pgdat); -void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); +void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order, + enum zone_type classzone_idx); bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, unsigned int alloc_flags, long free_pages); diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index e0b8b9173e1c..6570c5b45ba1 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -78,26 +78,29 @@ TRACE_EVENT(mm_vmscan_kswapd_wake, TRACE_EVENT(mm_vmscan_wakeup_kswapd, - TP_PROTO(int nid, int zid, int order), + TP_PROTO(int nid, int zid, int order, gfp_t gfp_flags), - TP_ARGS(nid, zid, order), + TP_ARGS(nid, zid, order, gfp_flags), TP_STRUCT__entry( - __field( int, nid ) - __field( int, zid ) - __field( int, order ) + __field( int, nid ) + __field( int, zid ) + __field( int, order ) + __field( gfp_t, gfp_flags ) ), TP_fast_assign( __entry->nid = nid; __entry->zid = zid; __entry->order = order; + __entry->gfp_flags = gfp_flags; ), - 
TP_printk("nid=%d zid=%d order=%d", + TP_printk("nid=%d zid=%d order=%d gfp_flags=%s", __entry->nid, __entry->zid, - __entry->order) + __entry->order, + show_gfp_flags(__entry->gfp_flags)) ); DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f6005b7c3446..02c1a60d7937 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3805,16 +3805,18 @@ retry: return page; } -static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) +static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, + const struct alloc_context *ac) { struct zoneref *z; struct zone *zone; pg_data_t *last_pgdat = NULL; + enum zone_type high_zoneidx = ac->high_zoneidx; - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, - ac->high_zoneidx, ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx, + ac->nodemask) { if (last_pgdat != zone->zone_pgdat) - wakeup_kswapd(zone, order, ac->high_zoneidx); + wakeup_kswapd(zone, gfp_mask, order, high_zoneidx); last_pgdat = zone->zone_pgdat; } } @@ -4093,7 +4095,7 @@ retry_cpuset: goto nopage; if (gfp_mask & __GFP_KSWAPD_RECLAIM) - wake_all_kswapds(order, ac); + wake_all_kswapds(order, gfp_mask, ac); /* * The adjusted alloc_flags might result in immediate success, so try @@ -4151,7 +4153,7 @@ retry_cpuset: retry: /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ if (gfp_mask & __GFP_KSWAPD_RECLAIM) - wake_all_kswapds(order, ac); + wake_all_kswapds(order, gfp_mask, ac); reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); if (reserve_flags) diff --git a/mm/vmscan.c b/mm/vmscan.c index 976be140a8ce..4390a8d5be41 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3539,16 +3539,21 @@ kswapd_try_sleep: } /* - * A zone is low on free memory, so wake its kswapd task to service it. + * A zone is low on free memory or too fragmented for high-order memory. If + * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's + * pgdat. It will wake up kcompactd after reclaiming memory. If kswapd reclaim + * has failed or is not needed, still wake up kcompactd if only compaction is + * needed. */ -void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) +void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, + enum zone_type classzone_idx) { pg_data_t *pgdat; if (!managed_zone(zone)) return; - if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) + if (!cpuset_zone_allowed(zone, gfp_flags)) return; pgdat = zone->zone_pgdat; pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, @@ -3557,14 +3562,23 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) if (!waitqueue_active(&pgdat->kswapd_wait)) return; - /* Hopeless node, leave it to direct reclaim */ - if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) - return; - - if (pgdat_balanced(pgdat, order, classzone_idx)) + /* Hopeless node, leave it to direct reclaim if possible */ + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || + pgdat_balanced(pgdat, order, classzone_idx)) { + /* + * There may be plenty of free memory available, but it's too + * fragmented for high-order allocations. Wake up kcompactd + * and rely on compaction_suitable() to determine if it's + * needed. If it fails, it will defer subsequent attempts to + * ratelimit its work. 
+ */ + if (!(gfp_flags & __GFP_DIRECT_RECLAIM)) + wakeup_kcompactd(pgdat, order, classzone_idx); return; + } - trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order); + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order, + gfp_flags); wake_up_interruptible(&pgdat->kswapd_wait); } -- cgit v1.2.3 From d46078b2888947a86b6bb997cd5927e602e8fdc9 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 5 Apr 2018 16:25:19 -0700 Subject: mm, oom: remove 3% bonus for CAP_SYS_ADMIN processes Since the 2.6 kernel, the oom killer has slightly biased away from CAP_SYS_ADMIN processes by discounting some of their memory usage in comparison to other processes. This has always been implicit and nothing exactly relies on the behavior. Gaurav notices that __task_cred() can dereference a potentially freed pointer if the task under consideration is exiting because a reference to the task_struct is not held. Remove the CAP_SYS_ADMIN bias so that all processes are treated equally. If any CAP_SYS_ADMIN process would like to be biased against, it is always allowed to adjust /proc/pid/oom_score_adj. Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1803071548510.6996@chino.kir.corp.google.com Signed-off-by: David Rientjes Reported-by: Gaurav Kohli Acked-by: Michal Hocko Cc: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 82a92ad67af3..c4c9ecd17c6e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -226,13 +226,6 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, mm_pgtables_bytes(p->mm) / PAGE_SIZE; task_unlock(p); - /* - * Root processes get 3% bonus, just like the __vm_enough_memory() - * implementation used by LSMs. - */ - if (has_capability_noaudit(p, CAP_SYS_ADMIN)) - points -= (points * 3) / 100; - /* Normalize to oom_score_adj units */ adj *= totalpages / 1000; points += adj; -- cgit v1.2.3 From 1c8f422059ae5da07db7406ab916203f9417e396 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Thu, 5 Apr 2018 16:25:23 -0700 Subject: mm: change return type to vm_fault_t The plan for these patches is to introduce the typedef, initially just as documentation ("These functions should return a VM_FAULT_ status"). We'll trickle the patches to individual drivers/filesystems through the maintainers, as far as possible. Then we'll change the typedef to an unsigned int and break the compilation of any unconverted drivers/filesystems. vmf_insert_page(), vmf_insert_mixed() and vmf_insert_pfn() are three newly added functions. The various drivers/filesystems where the return values of fault(), huge_fault(), page_mkwrite() and pfn_mkwrite() get converted will need them. These functions return the correct VM_FAULT_ code based on the err value. We've had bugs before where drivers returned -EFOO. And we have this silly inefficiency where vm_insert_xxx() returns an errno which (afaict) every driver then converts into a VM_FAULT code. In many cases drivers failed to return the correct VM_FAULT code even when vm_insert_xxx() failed. We have identified and cleaned up all those existing bugs and silly inefficiencies in drivers/filesystems by adding these three new inline wrappers. As mentioned above, we will trickle those patches to individual drivers/filesystems through their maintainers after these three wrapper functions are merged.
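As an illustration of the intended conversion, here is a hedged before/after sketch of a driver fault handler; the demo_* names are invented, while vm_insert_pfn(), vmf_insert_pfn(), vm_fault_t and the VM_FAULT_* codes come from this patch and the existing kernel API.

/* Hypothetical driver code; assumes linux/mm.h is included. */
static unsigned long demo_pfn(struct vm_fault *vmf);  /* invented helper */

/* Before: the driver hand-translates the errno from vm_insert_pfn(). */
static int demo_fault_old(struct vm_fault *vmf)
{
        int err = vm_insert_pfn(vmf->vma, vmf->address, demo_pfn(vmf));

        if (err == -ENOMEM)
                return VM_FAULT_OOM;
        if (err < 0 && err != -EBUSY)
                return VM_FAULT_SIGBUS;
        return VM_FAULT_NOPAGE;
}

/* After: the wrapper centralizes the translation, and the vm_fault_t
 * return type documents that only VM_FAULT_* codes can come back. */
static vm_fault_t demo_fault(struct vm_fault *vmf)
{
        return vmf_insert_pfn(vmf->vma, vmf->address, demo_pfn(vmf));
}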
Eventually we can convert vm_insert_xxx() into vmf_insert_xxx() and remove these inline wrappers, but these are a good intermediate step. Link: http://lkml.kernel.org/r/20180310162351.GA7422@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Acked-by: Michal Hocko Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 47 +++++++++++++++++++++++++++++++++++++++++++---- include/linux/mm_types.h | 2 ++ 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 88d82ba29d72..3ad632366973 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -386,18 +386,19 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); int (*split)(struct vm_area_struct * area, unsigned long addr); int (*mremap)(struct vm_area_struct * area); - int (*fault)(struct vm_fault *vmf); - int (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size); + vm_fault_t (*fault)(struct vm_fault *vmf); + vm_fault_t (*huge_fault)(struct vm_fault *vmf, + enum page_entry_size pe_size); void (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); unsigned long (*pagesize)(struct vm_area_struct * area); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ - int (*page_mkwrite)(struct vm_fault *vmf); + vm_fault_t (*page_mkwrite)(struct vm_fault *vmf); /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ - int (*pfn_mkwrite)(struct vm_fault *vmf); + vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf); /* called by access_process_vm when get_user_pages() fails, typically * for use by special VMAs that can switch between memory and hardware @@ -2424,6 +2425,44 @@ int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); +static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, + unsigned long addr, struct page *page) +{ + int err = vm_insert_page(vma, addr, page); + + if (err == -ENOMEM) + return VM_FAULT_OOM; + if (err < 0 && err != -EBUSY) + return VM_FAULT_SIGBUS; + + return VM_FAULT_NOPAGE; +} + +static inline vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, + unsigned long addr, pfn_t pfn) +{ + int err = vm_insert_mixed(vma, addr, pfn); + + if (err == -ENOMEM) + return VM_FAULT_OOM; + if (err < 0 && err != -EBUSY) + return VM_FAULT_SIGBUS; + + return VM_FAULT_NOPAGE; +} + +static inline vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn) +{ + int err = vm_insert_pfn(vma, addr, pfn); + + if (err == -ENOMEM) + return VM_FAULT_OOM; + if (err < 0 && err != -EBUSY) + return VM_FAULT_SIGBUS; + + return VM_FAULT_NOPAGE; +} struct page *follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int foll_flags, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index fd1af6b9591d..21612347d311 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -22,6 +22,8 @@ #endif #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) +typedef int vm_fault_t; + struct address_space; struct mem_cgroup; struct hmm; -- cgit v1.2.3 From 2c7452a075d4db2dc8e7a61369ede1afc05cd63e Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 5 Apr 2018 16:25:26 -0700 Subject: mm/page_isolation.c: make start_isolate_page_range() fail if already isolated start_isolate_page_range() is used to set 
the migrate type of a set of pageblocks to MIGRATE_ISOLATE while attempting to start a migration operation. It assumes that only one thread is calling it for the specified range. This routine is used by CMA, memory hotplug and gigantic huge pages. Each of these users synchronizes access to the range within its subsystem. However, two subsystems (CMA and gigantic huge pages, for example) could attempt operations on the same range. If this happens, one thread may 'undo' the work another thread is doing. This can result in pageblocks being incorrectly left marked as MIGRATE_ISOLATE and therefore not available for page allocation. What is ideally needed is a way to synchronize access to a set of pageblocks that are undergoing isolation and migration. The only thing we know about these pageblocks is that they are all in the same zone. A per-node mutex is too coarse as we want to allow multiple operations on different ranges within the same zone concurrently. Instead, we will use the migration type of the pageblocks themselves as a form of synchronization. start_isolate_page_range sets the migration type on a set of pageblocks, going in order from the one associated with the smallest pfn to the largest pfn. The zone lock is acquired to check and set the migration type. When going through the list of pageblocks, check if MIGRATE_ISOLATE is already set. If so, this indicates another thread is working on this pageblock. We know exactly which pageblocks we set, so clean up by undoing those and return -EBUSY. This allows start_isolate_page_range to serve as a synchronization mechanism and will allow for more general use by callers of these interfaces. Update comments in alloc_contig_range to reflect this new functionality. Each CPU holds the associated zone lock to modify or examine the migration type of a pageblock. And, it will only examine/update a single pageblock per lock acquire/release cycle. Link: http://lkml.kernel.org/r/20180309224731.16978-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Andrew Morton Cc: KAMEZAWA Hiroyuki Cc: Luiz Capitulino Cc: Michal Nazarewicz Cc: Michal Hocko Cc: Vlastimil Babka Cc: Mel Gorman Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 ++++---- mm/page_isolation.c | 18 +++++++++++++++++- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 02c1a60d7937..0b97b8ece4a9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7765,11 +7765,11 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, * @gfp_mask: GFP mask to use during compaction * * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES - * aligned, however it's the caller's responsibility to guarantee that - * we are the only thread that changes migrate type of pageblocks the - * pages fall in. + * aligned. The PFN range must belong to a single zone. * - * The PFN range must belong to a single zone. + * The first thing this routine does is attempt to MIGRATE_ISOLATE all + * pageblocks in the range. Once isolated, the pageblocks should not + * be modified by others. * * Returns zero on success or negative error code.
On success all * pages which PFN is in [start, end) are allocated for the caller and diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 165ed8117bd1..61dee77bb211 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -28,6 +28,14 @@ static int set_migratetype_isolate(struct page *page, int migratetype, spin_lock_irqsave(&zone->lock, flags); + /* + * We assume the caller intended to SET migrate type to isolate. + * If it is already set, then someone else must have raced and + * set it before us. Return -EBUSY + */ + if (is_migrate_isolate_page(page)) + goto out; + pfn = page_to_pfn(page); arg.start_pfn = pfn; arg.nr_pages = pageblock_nr_pages; @@ -166,7 +174,15 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * future will not be allocated again. * * start_pfn/end_pfn must be aligned to pageblock_order. - * Returns 0 on success and -EBUSY if any part of range cannot be isolated. + * Return 0 on success and -EBUSY if any part of range cannot be isolated. + * + * There is no high level synchronization mechanism that prevents two threads + * from trying to isolate overlapping ranges. If this happens, one thread + * will notice pageblocks in the overlapping range already set to isolate. + * This happens in set_migratetype_isolate, and set_migratetype_isolate + * returns an error. We then clean up by restoring the migration type on + * pageblocks we may have modified and return -EBUSY to caller. This + * prevents two threads from simultaneously working on overlapping ranges. */ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned migratetype, bool skip_hwpoisoned_pages) -- cgit v1.2.3 From 91241681c62a5a690c88eb2aca027f094125eaac Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 5 Apr 2018 16:25:30 -0700 Subject: include/linux/mmdebug.h: make VM_WARN* non-rvals At present the construct if (VM_WARN(...)) will compile OK with CONFIG_DEBUG_VM=y and will fail with CONFIG_DEBUG_VM=n. The reason is that VM_{WARN,BUG}* have always been special wrt. {WARN/BUG}* and never generate any code when DEBUG_VM is disabled. So we cannot really use it in conditionals. We considered changing things so that this construct works in both cases but that might cause unwanted code generation with CONFIG_DEBUG_VM=n. It is safer and simpler to make the build fail in both cases. [akpm@linux-foundation.org: changelog] Signed-off-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmdebug.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 57b0030d3800..2ad72d2c8cc5 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -37,10 +37,10 @@ void dump_mm(const struct mm_struct *mm); BUG(); \ } \ } while (0) -#define VM_WARN_ON(cond) WARN_ON(cond) -#define VM_WARN_ON_ONCE(cond) WARN_ON_ONCE(cond) -#define VM_WARN_ONCE(cond, format...) WARN_ONCE(cond, format) -#define VM_WARN(cond, format...) WARN(cond, format) +#define VM_WARN_ON(cond) (void)WARN_ON(cond) +#define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond) +#define VM_WARN_ONCE(cond, format...) (void)WARN_ONCE(cond, format) +#define VM_WARN(cond, format...) 
(void)WARN(cond, format) #else #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond) -- cgit v1.2.3 From 514c60324960137e74457fdc233a339b985fa8a8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 5 Apr 2018 16:25:34 -0700 Subject: headers: untangle kmemleak.h from mm.h Currently, slab.h #includes kmemleak.h for no obvious reason. It looks like it's only a convenience, so remove kmemleak.h from slab.h and add it to any users of kmemleak_* that don't already #include it. Also remove kmemleak.h from source files that do not use it. This is tested on i386 allmodconfig and x86_64 allmodconfig. It would be good to run it through the 0day bot for other $ARCHes. I have neither the horsepower nor the storage space for the other $ARCHes. Update: This patch has been extensively build-tested by both the 0day bot & kisskb/ozlabs build farms. Both of them reported 2 build failures for which patches are included here (in v2). [ slab.h is the second most used header file after module.h; kernel.h is right there with slab.h. There could be some minor error in the counting due to some #includes having comments after them and I didn't combine all of those. ] [akpm@linux-foundation.org: security/keys/big_key.c needs vmalloc.h, per sfr] Link: http://lkml.kernel.org/r/e4309f98-3749-93e1-4bb7-d9501a39d015@infradead.org Link: http://kisskb.ellerman.id.au/kisskb/head/13396/ Signed-off-by: Randy Dunlap Reviewed-by: Ingo Molnar Reported-by: Michael Ellerman [2 build failures] Reported-by: Fengguang Wu [2 build failures] Reviewed-by: Andrew Morton Cc: Wei Yongjun Cc: Luis R. Rodriguez Cc: Greg Kroah-Hartman Cc: Mimi Zohar Cc: John Johansen Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/sysdev/dart_iommu.c | 1 + arch/powerpc/sysdev/msi_bitmap.c | 1 + arch/s390/kernel/nmi.c | 2 +- arch/s390/kernel/smp.c | 1 - arch/sparc/kernel/irq_64.c | 1 - arch/x86/kernel/pci-dma.c | 1 - drivers/iommu/exynos-iommu.c | 1 + drivers/iommu/mtk_iommu_v1.c | 1 - drivers/net/ethernet/ti/cpsw.c | 1 + drivers/net/wireless/realtek/rtlwifi/pci.c | 1 - drivers/net/wireless/realtek/rtlwifi/rtl8192c/fw_common.c | 1 - drivers/staging/rtl8188eu/hal/fw.c | 2 +- drivers/staging/rtlwifi/pci.c | 1 - drivers/virtio/virtio_ring.c | 1 - include/linux/slab.h | 1 - kernel/ucount.c | 1 + lib/test_firmware.c | 1 + mm/cma.c | 1 + mm/memblock.c | 1 + net/core/sysctl_net_core.c | 1 - net/ipv4/route.c | 1 - security/apparmor/lsm.c | 1 - security/keys/big_key.c | 1 + 23 files changed, 11 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c index a6198d4f0f03..5ca3e22d0512 100644 --- a/arch/powerpc/sysdev/dart_iommu.c +++ b/arch/powerpc/sysdev/dart_iommu.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/sysdev/msi_bitmap.c b/arch/powerpc/sysdev/msi_bitmap.c index c4dae27172b3..6243a7e537d0 100644 --- a/arch/powerpc/sysdev/msi_bitmap.c +++ b/arch/powerpc/sysdev/msi_bitmap.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index c7a627620e5e..8c867b43c8eb 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index a4a9fe1934e9..2f8f7d7dd9a8 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -27,7 +27,6 @@ #include #include
#include -#include #include #include #include diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c index d66dde833f5e..713670e6d13d 100644 --- a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 14437116ffea..77625b60a510 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -6,7 +6,6 @@ #include #include #include -#include #include #include diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c index 2138102ef611..c5f4f7691b57 100644 --- a/drivers/iommu/exynos-iommu.c +++ b/drivers/iommu/exynos-iommu.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index 542930cd183d..5a96fd14ac22 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 1b4af54a4968..30371274409d 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -35,6 +35,7 @@ #include #include #include +#include #include diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c b/drivers/net/wireless/realtek/rtlwifi/pci.c index 2437422625bf..57bb8f049e59 100644 --- a/drivers/net/wireless/realtek/rtlwifi/pci.c +++ b/drivers/net/wireless/realtek/rtlwifi/pci.c @@ -31,7 +31,6 @@ #include "efuse.h" #include #include -#include #include MODULE_AUTHOR("lizhaoming "); diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192c/fw_common.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192c/fw_common.c index 015476e3f7e5..f3bff66e85d0 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8192c/fw_common.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192c/fw_common.c @@ -32,7 +32,6 @@ #include "../rtl8192ce/def.h" #include "fw_common.h" #include -#include static void _rtl92c_enable_fw_download(struct ieee80211_hw *hw, bool enable) { diff --git a/drivers/staging/rtl8188eu/hal/fw.c b/drivers/staging/rtl8188eu/hal/fw.c index 03d091bad13a..6b67b38a6a9f 100644 --- a/drivers/staging/rtl8188eu/hal/fw.c +++ b/drivers/staging/rtl8188eu/hal/fw.c @@ -30,7 +30,7 @@ #include "rtl8188e_hal.h" #include -#include +#include static void _rtl88e_enable_fw_download(struct adapter *adapt, bool enable) { diff --git a/drivers/staging/rtlwifi/pci.c b/drivers/staging/rtlwifi/pci.c index 70a64a5f564a..d56810eabde7 100644 --- a/drivers/staging/rtlwifi/pci.c +++ b/drivers/staging/rtlwifi/pci.c @@ -31,7 +31,6 @@ #include "efuse.h" #include #include -#include #include MODULE_AUTHOR("lizhaoming "); diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 71458f493cf8..21d464a29cf8 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git a/include/linux/slab.h b/include/linux/slab.h index 04402c637171..81ebd71f8c03 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -125,7 +125,6 @@ #define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= \ (unsigned long)ZERO_SIZE_PTR) -#include #include struct mem_cgroup; diff --git a/kernel/ucount.c b/kernel/ucount.c index b4eeee03934f..f48d1b6376a4 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #define UCOUNTS_HASHTABLE_BITS 
10 diff --git a/lib/test_firmware.c b/lib/test_firmware.c index 078a61480573..cee000ac54d8 100644 --- a/lib/test_firmware.c +++ b/lib/test_firmware.c @@ -21,6 +21,7 @@ #include #include #include +#include #define TEST_FIRMWARE_NAME "test-firmware.bin" #define TEST_FIRMWARE_NUM_REQS 4 diff --git a/mm/cma.c b/mm/cma.c index 0600fc08a9f4..5809bbe360d7 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include "cma.h" diff --git a/mm/memblock.c b/mm/memblock.c index 04406a930bfc..ecc6cb58cd33 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index b3b609f0eeb5..b1a2c5e38530 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 8322e479f299..594a1c605c92 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -108,7 +108,6 @@ #include #ifdef CONFIG_SYSCTL #include -#include #endif #include #include diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 9a65eeaf7dfa..6134302c143c 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include "include/apparmor.h" diff --git a/security/keys/big_key.c b/security/keys/big_key.c index fa728f662a6f..933623784ccd 100644 --- a/security/keys/big_key.c +++ b/security/keys/big_key.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 644d87dccdc69cf79834a72ed0c889580d6af32a Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Thu, 5 Apr 2018 16:25:38 -0700 Subject: mm/memblock.c: cast constant ULLONG_MAX to phys_addr_t This fixes a warning shown when compiling with clang and phys_addr_t is a 32-bit int: mm/memblock.c:927:15: warning: implicit conversion from 'unsigned long long' to 'phys_addr_t' (aka 'unsigned int') changes value from 18446744073709551615 to 4294967295 [-Wconstant-conversion] r->base : ULLONG_MAX; ^~~~~~~~~~ ./include/linux/kernel.h:30:21: note: expanded from macro 'ULLONG_MAX' #define ULLONG_MAX (~0ULL) ^~~~~ Link: http://lkml.kernel.org/r/20180319005645.29051-1-stefan@agner.ch Signed-off-by: Stefan Agner Reviewed-by: Andrew Morton Cc: Michal Hocko Cc: Catalin Marinas Cc: Pavel Tatashin Cc: Ard Biesheuvel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memblock.c b/mm/memblock.c index ecc6cb58cd33..9b04568ad42a 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -925,7 +925,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, r = &type_b->regions[idx_b]; r_start = idx_b ? r[-1].base + r[-1].size : 0; r_end = idx_b < type_b->cnt ? - r->base : ULLONG_MAX; + r->base : (phys_addr_t)ULLONG_MAX; /* * if idx_b advanced past idx_a, @@ -1041,7 +1041,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags, r = &type_b->regions[idx_b]; r_start = idx_b ? r[-1].base + r[-1].size : 0; r_end = idx_b < type_b->cnt ?
- r->base : ULLONG_MAX; + r->base : (phys_addr_t)ULLONG_MAX; /* * if idx_b advanced past idx_a, * break out to advance idx_a -- cgit v1.2.3 From 77da2ba0648a4fd52e5ff97b8b2b8dd312aec4b0 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Thu, 5 Apr 2018 16:25:41 -0700 Subject: mm/ksm: fix interaction with THP This patch fixes a corner case for KSM. When two pages belong or belonged to the same transparent hugepage, and they should be merged, KSM fails to split the page, and therefore no merging happens. This bug can be reproduced by: * making sure ksm is running (disabling ksmtuned if necessary) * enabling transparent hugepages * allocating a THP-aligned 1-THP-sized buffer e.g. on amd64: posix_memalign(&p, 1<<21, 1<<21) * filling it with the same values e.g. memset(p, 42, 1<<21) * performing madvise to make it mergeable e.g. madvise(p, 1<<21, MADV_MERGEABLE) * waiting for KSM to perform a few scans The expected outcome is that all the pages get merged (1 shared and the rest sharing); the actual outcome is that no pages get merged (1 unshared and the rest volatile) The reason for this behaviour is that we increase the reference count once for both pages we want to merge, but if they belong to the same hugepage (or compound page), the reference counter used in both cases is the one of the head of the compound page. This means that split_huge_page will find the reference count too high and will fail. This patch solves this problem by testing if the two pages to merge belong to the same hugepage when attempting to merge them. If so, the hugepage is split safely. This means that the hugepage is not split unless necessary. Link: http://lkml.kernel.org/r/1521548069-24758-1-git-send-email-imbrenda@linux.vnet.ibm.com Signed-off-by: Claudio Imbrenda Co-authored-by: Gerald Schaefer Reviewed-by: Andrew Morton Cc: Andrea Arcangeli Cc: Minchan Kim Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/mm/ksm.c b/mm/ksm.c index 7596e14bc233..e8d6c6210b80 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2082,8 +2082,22 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) tree_rmap_item = unstable_tree_search_insert(rmap_item, page, &tree_page); if (tree_rmap_item) { + bool split; + kpage = try_to_merge_two_pages(rmap_item, page, tree_rmap_item, tree_page); + /* + * If both pages we tried to merge belong to the same compound + * page, then we actually ended up increasing the reference + * count of the same compound page twice, and split_huge_page + * failed. + * Here we set a flag if that happened, and we use it later to + * try split_huge_page again. Since we call put_page right + * afterwards, the reference count will be correct and + * split_huge_page should succeed. + */ + split = PageTransCompound(page) + && compound_head(page) == compound_head(tree_page); put_page(tree_page); if (kpage) { /* @@ -2110,6 +2124,20 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) break_cow(tree_rmap_item); break_cow(rmap_item); } + } else if (split) { + /* + * We are here if we tried to merge two pages and + * failed because they both belonged to the same + * compound page. We will split the page now, but no + * merging will take place. + * We do not want to add the cost of a full lock; if + * the page is locked, it is better to skip it and + * perhaps try again later.
+ */ + if (!trylock_page(page)) + return; + split_huge_page(page); + unlock_page(page); } } } -- cgit v1.2.3 From 97b1255cb27c551d7c3c5c496d787da40772da99 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 5 Apr 2018 16:25:45 -0700 Subject: mm,oom_reaper: check for MMF_OOM_SKIP before complaining I got "oom_reaper: unable to reap pid:" messages when the victim thread was blocked inside free_pgtables() (which occurred after returning from unmap_vmas() and setting MMF_OOM_SKIP). We don't need to complain when exit_mmap() already set MMF_OOM_SKIP. Killed process 7558 (a.out) total-vm:4176kB, anon-rss:84kB, file-rss:0kB, shmem-rss:0kB oom_reaper: unable to reap pid:7558 (a.out) a.out D13272 7558 6931 0x00100084 Call Trace: schedule+0x2d/0x80 rwsem_down_write_failed+0x2bb/0x440 call_rwsem_down_write_failed+0x13/0x20 down_write+0x49/0x60 unlink_file_vma+0x28/0x50 free_pgtables+0x36/0x100 exit_mmap+0xbb/0x180 mmput+0x50/0x110 copy_process.part.41+0xb61/0x1fe0 _do_fork+0xe6/0x560 do_syscall_64+0x74/0x230 entry_SYSCALL_64_after_hwframe+0x42/0xb7 Link: http://lkml.kernel.org/r/201803221946.DHG65638.VFJHFtOSQLOMOF@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Acked-by: David Rientjes Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c4c9ecd17c6e..ff992fa8760a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -590,7 +590,8 @@ static void oom_reap_task(struct task_struct *tsk) while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm)) schedule_timeout_idle(HZ/10); - if (attempts <= MAX_OOM_REAP_RETRIES) + if (attempts <= MAX_OOM_REAP_RETRIES || + test_bit(MMF_OOM_SKIP, &mm->flags)) goto done; -- cgit v1.2.3