From 9206a3af4fc0cebbefca2d79876d279bdd8d582b Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Fri, 17 Dec 2021 13:55:58 +0200 Subject: clk: ti: Move dra7 clock devices out of the legacy section I accidentally added some dra7 clock defines to the legacy section that we want to stop using. Let's move the defines to the right location. Note that this is just a cosmetic fix. Cc: linux-clk@vger.kernel.org Cc: Stephen Boyd Cc: Tero Kristo Acked-by: Rob Herring Signed-off-by: Tony Lindgren --- include/dt-bindings/clock/dra7.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/dt-bindings/clock/dra7.h b/include/dt-bindings/clock/dra7.h index 7d57063b8a65..29ff6b895848 100644 --- a/include/dt-bindings/clock/dra7.h +++ b/include/dt-bindings/clock/dra7.h @@ -84,17 +84,10 @@ #define DRA7_L3_MAIN_2_CLKCTRL DRA7_CLKCTRL_INDEX(0x20) #define DRA7_L3_INSTR_CLKCTRL DRA7_CLKCTRL_INDEX(0x28) -/* iva clocks */ -#define DRA7_IVA_CLKCTRL DRA7_CLKCTRL_INDEX(0x20) -#define DRA7_SL2IF_CLKCTRL DRA7_CLKCTRL_INDEX(0x28) - /* dss clocks */ #define DRA7_DSS_CORE_CLKCTRL DRA7_CLKCTRL_INDEX(0x20) #define DRA7_BB2D_CLKCTRL DRA7_CLKCTRL_INDEX(0x30) -/* gpu clocks */ -#define DRA7_GPU_CLKCTRL DRA7_CLKCTRL_INDEX(0x20) - /* l3init clocks */ #define DRA7_MMC1_CLKCTRL DRA7_CLKCTRL_INDEX(0x28) #define DRA7_MMC2_CLKCTRL DRA7_CLKCTRL_INDEX(0x30) @@ -267,10 +260,17 @@ #define DRA7_L3INSTR_L3_MAIN_2_CLKCTRL DRA7_CLKCTRL_INDEX(0x20) #define DRA7_L3INSTR_L3_INSTR_CLKCTRL DRA7_CLKCTRL_INDEX(0x28) +/* iva clocks */ +#define DRA7_IVA_CLKCTRL DRA7_CLKCTRL_INDEX(0x20) +#define DRA7_SL2IF_CLKCTRL DRA7_CLKCTRL_INDEX(0x28) + /* dss clocks */ #define DRA7_DSS_DSS_CORE_CLKCTRL DRA7_CLKCTRL_INDEX(0x20) #define DRA7_DSS_BB2D_CLKCTRL DRA7_CLKCTRL_INDEX(0x30) +/* gpu clocks */ +#define DRA7_GPU_CLKCTRL DRA7_CLKCTRL_INDEX(0x20) + /* l3init clocks */ #define DRA7_L3INIT_MMC1_CLKCTRL DRA7_CLKCTRL_INDEX(0x28) #define DRA7_L3INIT_MMC2_CLKCTRL DRA7_CLKCTRL_INDEX(0x30) -- cgit v1.2.3 From 5298d4bfe80f6ae6ae2777bcd1357b0022d98573 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Jan 2022 07:56:14 +0100 Subject: unicode: clean up the Kconfig symbol confusion Turn the CONFIG_UNICODE symbol into a tristate that generates some always built in code and remove the confusing CONFIG_UNICODE_UTF8_DATA symbol. Note that a lot of the IS_ENABLED() checks could be turned from cpp statements into normal ifs, but this change is intended to be fairly mechanic, so that should be cleaned up later. Fixes: 2b3d04787012 ("unicode: Add utf8-data module") Reported-by: Linus Torvalds Reviewed-by: Eric Biggers Signed-off-by: Christoph Hellwig Signed-off-by: Gabriel Krisman Bertazi --- fs/Makefile | 2 +- fs/ext4/ext4.h | 14 +++++++------- fs/ext4/hash.c | 2 +- fs/ext4/namei.c | 12 ++++++------ fs/ext4/super.c | 10 +++++----- fs/ext4/sysfs.c | 8 ++++---- fs/f2fs/dir.c | 10 +++++----- fs/f2fs/f2fs.h | 2 +- fs/f2fs/hash.c | 2 +- fs/f2fs/namei.c | 4 ++-- fs/f2fs/recovery.c | 4 ++-- fs/f2fs/super.c | 10 +++++----- fs/f2fs/sysfs.c | 10 +++++----- fs/libfs.c | 10 +++++----- fs/unicode/Kconfig | 18 +++++------------- fs/unicode/Makefile | 6 ++++-- include/linux/fs.h | 2 +- 17 files changed, 60 insertions(+), 66 deletions(-) (limited to 'include') diff --git a/fs/Makefile b/fs/Makefile index 84c5e4cdfee5..c71ee0127866 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -94,7 +94,7 @@ obj-$(CONFIG_EXPORTFS) += exportfs/ obj-$(CONFIG_NFSD) += nfsd/ obj-$(CONFIG_LOCKD) += lockd/ obj-$(CONFIG_NLS) += nls/ -obj-$(CONFIG_UNICODE) += unicode/ +obj-y += unicode/ obj-$(CONFIG_SYSV_FS) += sysv/ obj-$(CONFIG_SMBFS_COMMON) += smbfs_common/ obj-$(CONFIG_CIFS) += cifs/ diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 71a3cdceaa03..242e74cfb060 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2485,7 +2485,7 @@ struct ext4_filename { #ifdef CONFIG_FS_ENCRYPTION struct fscrypt_str crypto_buf; #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) struct fscrypt_str cf_name; #endif }; @@ -2721,7 +2721,7 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb, struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) extern int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, struct ext4_filename *fname); @@ -2754,7 +2754,7 @@ static inline int ext4_fname_setup_filename(struct inode *dir, ext4_fname_from_fscrypt_name(fname, &name); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, iname, fname); #endif return err; @@ -2773,7 +2773,7 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir, ext4_fname_from_fscrypt_name(fname, &name); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname); #endif return err; @@ -2790,7 +2790,7 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname) fname->usr_fname = NULL; fname->disk_name.name = NULL; -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) kfree(fname->cf_name.name); fname->cf_name.name = NULL; #endif @@ -2806,7 +2806,7 @@ static inline int ext4_fname_setup_filename(struct inode *dir, fname->disk_name.name = (unsigned char *) iname->name; fname->disk_name.len = iname->len; -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, iname, fname); #endif @@ -2822,7 +2822,7 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir, static inline void ext4_fname_free_filename(struct ext4_filename *fname) { -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) kfree(fname->cf_name.name); fname->cf_name.name = NULL; #endif diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index f34f4176c1e7..147b5241dd94 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -290,7 +290,7 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len, int ext4fs_dirhash(const struct inode *dir, const char *name, int len, struct dx_hash_info *hinfo) { -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) const struct unicode_map *um = dir->i_sb->s_encoding; int r, dlen; unsigned char *buff; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 52c9bd154122..269d2d051ede 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1317,7 +1317,7 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) dx_set_count(entries, count + 1); } -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* * Test whether a case-insensitive directory entry matches the filename * being searched for. If quick is set, assume the name being looked up @@ -1428,7 +1428,7 @@ static bool ext4_match(struct inode *parent, f.crypto_buf = fname->crypto_buf; #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (parent->i_sb->s_encoding && IS_CASEFOLDED(parent) && (!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) { if (fname->cf_name.name) { @@ -1800,7 +1800,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi } } -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (!inode && IS_CASEFOLDED(dir)) { /* Eventually we want to call d_add_ci(dentry, NULL) * for negative dentries in the encoding case as @@ -2308,7 +2308,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if (fscrypt_is_nokey_name(dentry)) return -ENOKEY; -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) && sb->s_encoding && utf8_validate(sb->s_encoding, &dentry->d_name)) return -EINVAL; @@ -3126,7 +3126,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) ext4_fc_track_unlink(handle, dentry); retval = ext4_mark_inode_dirty(handle, dir); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid * invalidating the dentries here, alongside with returning the @@ -3231,7 +3231,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) retval = __ext4_unlink(handle, dir, &dentry->d_name, d_inode(dentry)); if (!retval) ext4_fc_track_unlink(handle, dentry); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid * invalidating the dentries here, alongside with returning the diff --git a/fs/ext4/super.c b/fs/ext4/super.c index db9fe4843529..52be1ca38eef 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1302,7 +1302,7 @@ static void ext4_put_super(struct super_block *sb) kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev); fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif kfree(sbi); @@ -1962,7 +1962,7 @@ static const struct mount_opts { {Opt_err, 0, 0} }; -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) static const struct ext4_sb_encodings { __u16 magic; char *name; @@ -3609,7 +3609,7 @@ int ext4_feature_set_ok(struct super_block *sb, int readonly) return 0; } -#ifndef CONFIG_UNICODE +#if !IS_ENABLED(CONFIG_UNICODE) if (ext4_has_feature_casefold(sb)) { ext4_msg(sb, KERN_ERR, "Filesystem with casefold feature cannot be " @@ -4613,7 +4613,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (err < 0) goto failed_mount; -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (ext4_has_feature_casefold(sb) && !sb->s_encoding) { const struct ext4_sb_encodings *encoding_info; struct unicode_map *encoding; @@ -5517,7 +5517,7 @@ failed_mount: if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index f61e65ae27d8..d233c24ea342 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -309,7 +309,7 @@ EXT4_ATTR_FEATURE(meta_bg_resize); EXT4_ATTR_FEATURE(encryption); EXT4_ATTR_FEATURE(test_dummy_encryption_v2); #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) EXT4_ATTR_FEATURE(casefold); #endif #ifdef CONFIG_FS_VERITY @@ -317,7 +317,7 @@ EXT4_ATTR_FEATURE(verity); #endif EXT4_ATTR_FEATURE(metadata_csum_seed); EXT4_ATTR_FEATURE(fast_commit); -#if defined(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) +#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) EXT4_ATTR_FEATURE(encrypted_casefold); #endif @@ -329,7 +329,7 @@ static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(encryption), ATTR_LIST(test_dummy_encryption_v2), #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) ATTR_LIST(casefold), #endif #ifdef CONFIG_FS_VERITY @@ -337,7 +337,7 @@ static struct attribute *ext4_feat_attrs[] = { #endif ATTR_LIST(metadata_csum_seed), ATTR_LIST(fast_commit), -#if defined(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) +#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) ATTR_LIST(encrypted_casefold), #endif NULL, diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 1820e9c106f7..166f08623362 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -16,7 +16,7 @@ #include "xattr.h" #include -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) extern struct kmem_cache *f2fs_cf_name_slab; #endif @@ -79,7 +79,7 @@ unsigned char f2fs_get_de_type(struct f2fs_dir_entry *de) int f2fs_init_casefolded_name(const struct inode *dir, struct f2fs_filename *fname) { -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) struct super_block *sb = dir->i_sb; if (IS_CASEFOLDED(dir)) { @@ -174,7 +174,7 @@ void f2fs_free_filename(struct f2fs_filename *fname) kfree(fname->crypto_buf.name); fname->crypto_buf.name = NULL; #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (fname->cf_name.name) { kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name); fname->cf_name.name = NULL; @@ -208,7 +208,7 @@ static struct f2fs_dir_entry *find_in_block(struct inode *dir, return f2fs_find_target_dentry(&d, fname, max_slots); } -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* * Test whether a case-insensitive directory entry matches the filename * being searched for. @@ -266,7 +266,7 @@ static inline int f2fs_match_name(const struct inode *dir, { struct fscrypt_name f; -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (fname->cf_name.name) { struct qstr cf = FSTR_TO_QSTR(&fname->cf_name); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d0d603187171..4da88928ffb5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -487,7 +487,7 @@ struct f2fs_filename { */ struct fscrypt_str crypto_buf; #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* * For casefolded directories: the casefolded name, but it's left NULL * if the original name is not valid Unicode, if the directory is both diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index e3beac546c63..3cb1e7a24740 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -105,7 +105,7 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname) return; } -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (IS_CASEFOLDED(dir)) { /* * If the casefolded name is provided, hash it instead of the diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a728a0af9ce0..5f213f05556d 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -561,7 +561,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, goto out_iput; } out_splice: -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (!inode && IS_CASEFOLDED(dir)) { /* Eventually we want to call d_add_ci(dentry, NULL) * for negative dentries in the encoding case as @@ -622,7 +622,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) goto fail; } f2fs_delete_entry(de, page, dir, inode); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid * invalidating the dentries here, alongside with returning the diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index d1664a0567ef..2fbbc820c00a 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -46,7 +46,7 @@ static struct kmem_cache *fsync_entry_slab; -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) extern struct kmem_cache *f2fs_cf_name_slab; #endif @@ -149,7 +149,7 @@ static int init_recovered_filename(const struct inode *dir, if (err) return err; f2fs_hash_filename(dir, fname); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* Case-sensitive match is fine for recovery */ kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name); fname->cf_name.name = NULL; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 15f12ece0ac6..b870c6459fa1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -256,7 +256,7 @@ void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...) va_end(args); } -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) static const struct f2fs_sb_encodings { __u16 magic; char *name; @@ -1218,7 +1218,7 @@ default_check: return -EINVAL; } #endif -#ifndef CONFIG_UNICODE +#if !IS_ENABLED(CONFIG_UNICODE) if (f2fs_sb_has_casefold(sbi)) { f2fs_err(sbi, "Filesystem with casefold feature cannot be mounted without CONFIG_UNICODE"); @@ -1578,7 +1578,7 @@ static void f2fs_put_super(struct super_block *sb) f2fs_destroy_iostat(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif kfree(sbi); @@ -3861,7 +3861,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) { -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (f2fs_sb_has_casefold(sbi) && !sbi->sb->s_encoding) { const struct f2fs_sb_encodings *encoding_info; struct unicode_map *encoding; @@ -4412,7 +4412,7 @@ free_bio_info: for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); sb->s_encoding = NULL; #endif diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 8408f77764e8..fa3d9cb2d69b 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -192,7 +192,7 @@ static ssize_t unusable_show(struct f2fs_attr *a, static ssize_t encoding_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) struct super_block *sb = sbi->sb; if (f2fs_sb_has_casefold(sbi)) @@ -756,7 +756,7 @@ F2FS_GENERAL_RO_ATTR(avg_vblocks); #ifdef CONFIG_FS_ENCRYPTION F2FS_FEATURE_RO_ATTR(encryption); F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) F2FS_FEATURE_RO_ATTR(encrypted_casefold); #endif #endif /* CONFIG_FS_ENCRYPTION */ @@ -775,7 +775,7 @@ F2FS_FEATURE_RO_ATTR(lost_found); F2FS_FEATURE_RO_ATTR(verity); #endif F2FS_FEATURE_RO_ATTR(sb_checksum); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) F2FS_FEATURE_RO_ATTR(casefold); #endif F2FS_FEATURE_RO_ATTR(readonly); @@ -886,7 +886,7 @@ static struct attribute *f2fs_feat_attrs[] = { #ifdef CONFIG_FS_ENCRYPTION ATTR_LIST(encryption), ATTR_LIST(test_dummy_encryption_v2), -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) ATTR_LIST(encrypted_casefold), #endif #endif /* CONFIG_FS_ENCRYPTION */ @@ -905,7 +905,7 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(verity), #endif ATTR_LIST(sb_checksum), -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) ATTR_LIST(casefold), #endif ATTR_LIST(readonly), diff --git a/fs/libfs.c b/fs/libfs.c index ba7438ab9371..974125270a42 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1379,7 +1379,7 @@ bool is_empty_dir_inode(struct inode *inode) (inode->i_op == &empty_dir_inode_operations); } -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* * Determine if the name of a dentry should be casefolded. * @@ -1473,7 +1473,7 @@ static const struct dentry_operations generic_encrypted_dentry_ops = { }; #endif -#if defined(CONFIG_FS_ENCRYPTION) && defined(CONFIG_UNICODE) +#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE) static const struct dentry_operations generic_encrypted_ci_dentry_ops = { .d_hash = generic_ci_d_hash, .d_compare = generic_ci_d_compare, @@ -1508,10 +1508,10 @@ void generic_set_encrypted_ci_d_ops(struct dentry *dentry) #ifdef CONFIG_FS_ENCRYPTION bool needs_encrypt_ops = dentry->d_flags & DCACHE_NOKEY_NAME; #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) bool needs_ci_ops = dentry->d_sb->s_encoding; #endif -#if defined(CONFIG_FS_ENCRYPTION) && defined(CONFIG_UNICODE) +#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE) if (needs_encrypt_ops && needs_ci_ops) { d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops); return; @@ -1523,7 +1523,7 @@ void generic_set_encrypted_ci_d_ops(struct dentry *dentry) return; } #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (needs_ci_ops) { d_set_d_op(dentry, &generic_ci_dentry_ops); return; diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig index 610d7bc05d6e..da786a687fdc 100644 --- a/fs/unicode/Kconfig +++ b/fs/unicode/Kconfig @@ -3,21 +3,13 @@ # UTF-8 normalization # config UNICODE - bool "UTF-8 normalization and casefolding support" + tristate "UTF-8 normalization and casefolding support" help Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding - support. - -config UNICODE_UTF8_DATA - tristate "UTF-8 normalization and casefolding tables" - depends on UNICODE - default UNICODE - help - This contains a large table of case foldings, which can be loaded as - a separate module if you say M here. To be on the safe side stick - to the default of Y. Saying N here makes no sense, if you do not want - utf8 casefolding support, disable CONFIG_UNICODE instead. + support. If you say M here the large table of case foldings will + be a separate loadable module that gets requested only when a file + system actually use it. config UNICODE_NORMALIZATION_SELFTEST tristate "Test UTF-8 normalization support" - depends on UNICODE_UTF8_DATA + depends on UNICODE diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile index 2f9d9188852b..0cc87423de82 100644 --- a/fs/unicode/Makefile +++ b/fs/unicode/Makefile @@ -1,8 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_UNICODE) += unicode.o +ifneq ($(CONFIG_UNICODE),) +obj-y += unicode.o +endif +obj-$(CONFIG_UNICODE) += utf8data.o obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o -obj-$(CONFIG_UNICODE_UTF8_DATA) += utf8data.o unicode-y := utf8-norm.o utf8-core.o diff --git a/include/linux/fs.h b/include/linux/fs.h index c8510da6cc6d..fdac22d16c2b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1490,7 +1490,7 @@ struct super_block { #ifdef CONFIG_FS_VERITY const struct fsverity_operations *s_vop; #endif -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) struct unicode_map *s_encoding; __u16 s_encoding_flags; #endif -- cgit v1.2.3 From ebb7fb1557b1d03b906b668aa2164b51e6b7d19a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 26 Jan 2022 09:19:20 -0800 Subject: xfs, iomap: limit individual ioend chain lengths in writeback Trond Myklebust reported soft lockups in XFS IO completion such as this: watchdog: BUG: soft lockup - CPU#12 stuck for 23s! [kworker/12:1:3106] CPU: 12 PID: 3106 Comm: kworker/12:1 Not tainted 4.18.0-305.10.2.el8_4.x86_64 #1 Workqueue: xfs-conv/md127 xfs_end_io [xfs] RIP: 0010:_raw_spin_unlock_irqrestore+0x11/0x20 Call Trace: wake_up_page_bit+0x8a/0x110 iomap_finish_ioend+0xd7/0x1c0 iomap_finish_ioends+0x7f/0xb0 xfs_end_ioend+0x6b/0x100 [xfs] xfs_end_io+0xb9/0xe0 [xfs] process_one_work+0x1a7/0x360 worker_thread+0x1fa/0x390 kthread+0x116/0x130 ret_from_fork+0x35/0x40 Ioends are processed as an atomic completion unit when all the chained bios in the ioend have completed their IO. Logically contiguous ioends can also be merged and completed as a single, larger unit. Both of these things can be problematic as both the bio chains per ioend and the size of the merged ioends processed as a single completion are both unbound. If we have a large sequential dirty region in the page cache, write_cache_pages() will keep feeding us sequential pages and we will keep mapping them into ioends and bios until we get a dirty page at a non-sequential file offset. These large sequential runs can will result in bio and ioend chaining to optimise the io patterns. The pages iunder writeback are pinned within these chains until the submission chaining is broken, allowing the entire chain to be completed. This can result in huge chains being processed in IO completion context. We get deep bio chaining if we have large contiguous physical extents. We will keep adding pages to the current bio until it is full, then we'll chain a new bio to keep adding pages for writeback. Hence we can build bio chains that map millions of pages and tens of gigabytes of RAM if the page cache contains big enough contiguous dirty file regions. This long bio chain pins those pages until the final bio in the chain completes and the ioend can iterate all the chained bios and complete them. OTOH, if we have a physically fragmented file, we end up submitting one ioend per physical fragment that each have a small bio or bio chain attached to them. We do not chain these at IO submission time, but instead we chain them at completion time based on file offset via iomap_ioend_try_merge(). Hence we can end up with unbound ioend chains being built via completion merging. XFS can then do COW remapping or unwritten extent conversion on that merged chain, which involves walking an extent fragment at a time and running a transaction to modify the physical extent information. IOWs, we merge all the discontiguous ioends together into a contiguous file range, only to then process them individually as discontiguous extents. This extent manipulation is computationally expensive and can run in a tight loop, so merging logically contiguous but physically discontigous ioends gains us nothing except for hiding the fact the fact we broke the ioends up into individual physical extents at submission and then need to loop over those individual physical extents at completion. Hence we need to have mechanisms to limit ioend sizes and to break up completion processing of large merged ioend chains: 1. bio chains per ioend need to be bound in length. Pure overwrites go straight to iomap_finish_ioend() in softirq context with the exact bio chain attached to the ioend by submission. Hence the only way to prevent long holdoffs here is to bound ioend submission sizes because we can't reschedule in softirq context. 2. iomap_finish_ioends() has to handle unbound merged ioend chains correctly. This relies on any one call to iomap_finish_ioend() being bound in runtime so that cond_resched() can be issued regularly as the long ioend chain is processed. i.e. this relies on mechanism #1 to limit individual ioend sizes to work correctly. 3. filesystems have to loop over the merged ioends to process physical extent manipulations. This means they can loop internally, and so we break merging at physical extent boundaries so the filesystem can easily insert reschedule points between individual extent manipulations. Signed-off-by: Dave Chinner Reported-and-tested-by: Trond Myklebust Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap/buffered-io.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++---- fs/xfs/xfs_aops.c | 16 +++++++++++++++- include/linux/iomap.h | 2 ++ 3 files changed, 65 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index c938bbad075e..6c51a75d0be6 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -21,6 +21,8 @@ #include "../internal.h" +#define IOEND_BATCH_SIZE 4096 + /* * Structure allocated for each folio when block size < folio size * to track sub-folio uptodate status and I/O completions. @@ -1039,7 +1041,7 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, * state, release holds on bios, and finally free up memory. Do not use the * ioend after this. */ -static void +static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error) { struct inode *inode = ioend->io_inode; @@ -1048,6 +1050,7 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) u64 start = bio->bi_iter.bi_sector; loff_t offset = ioend->io_offset; bool quiet = bio_flagged(bio, BIO_QUIET); + u32 folio_count = 0; for (bio = &ioend->io_inline_bio; bio; bio = next) { struct folio_iter fi; @@ -1062,9 +1065,11 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) next = bio->bi_private; /* walk all folios in bio, ending page IO on them */ - bio_for_each_folio_all(fi, bio) + bio_for_each_folio_all(fi, bio) { iomap_finish_folio_write(inode, fi.folio, fi.length, error); + folio_count++; + } bio_put(bio); } /* The ioend has been freed by bio_put() */ @@ -1074,20 +1079,36 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) "%s: writeback error on inode %lu, offset %lld, sector %llu", inode->i_sb->s_id, inode->i_ino, offset, start); } + return folio_count; } +/* + * Ioend completion routine for merged bios. This can only be called from task + * contexts as merged ioends can be of unbound length. Hence we have to break up + * the writeback completions into manageable chunks to avoid long scheduler + * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get + * good batch processing throughput without creating adverse scheduler latency + * conditions. + */ void iomap_finish_ioends(struct iomap_ioend *ioend, int error) { struct list_head tmp; + u32 completions; + + might_sleep(); list_replace_init(&ioend->io_list, &tmp); - iomap_finish_ioend(ioend, error); + completions = iomap_finish_ioend(ioend, error); while (!list_empty(&tmp)) { + if (completions > IOEND_BATCH_SIZE * 8) { + cond_resched(); + completions = 0; + } ioend = list_first_entry(&tmp, struct iomap_ioend, io_list); list_del_init(&ioend->io_list); - iomap_finish_ioend(ioend, error); + completions += iomap_finish_ioend(ioend, error); } } EXPORT_SYMBOL_GPL(iomap_finish_ioends); @@ -1108,6 +1129,18 @@ iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) return false; if (ioend->io_offset + ioend->io_size != next->io_offset) return false; + /* + * Do not merge physically discontiguous ioends. The filesystem + * completion functions will have to iterate the physical + * discontiguities even if we merge the ioends at a logical level, so + * we don't gain anything by merging physical discontiguities here. + * + * We cannot use bio->bi_iter.bi_sector here as it is modified during + * submission so does not point to the start sector of the bio at + * completion. + */ + if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector) + return false; return true; } @@ -1209,8 +1242,10 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, ioend->io_flags = wpc->iomap.flags; ioend->io_inode = inode; ioend->io_size = 0; + ioend->io_folios = 0; ioend->io_offset = offset; ioend->io_bio = bio; + ioend->io_sector = sector; return ioend; } @@ -1251,6 +1286,13 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, return false; if (sector != bio_end_sector(wpc->ioend->io_bio)) return false; + /* + * Limit ioend bio chain lengths to minimise IO completion latency. This + * also prevents long tight loops ending page writeback on all the + * folios in the ioend. + */ + if (wpc->ioend->io_folios >= IOEND_BATCH_SIZE) + return false; return true; } @@ -1335,6 +1377,8 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, &submit_list); count++; } + if (count) + wpc->ioend->io_folios++; WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list)); WARN_ON_ONCE(!folio_test_locked(folio)); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 2705f91bdd0d..9d6a67c7d227 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -136,7 +136,20 @@ done: memalloc_nofs_restore(nofs_flag); } -/* Finish all pending io completions. */ +/* + * Finish all pending IO completions that require transactional modifications. + * + * We try to merge physical and logically contiguous ioends before completion to + * minimise the number of transactions we need to perform during IO completion. + * Both unwritten extent conversion and COW remapping need to iterate and modify + * one physical extent at a time, so we gain nothing by merging physically + * discontiguous extents here. + * + * The ioend chain length that we can be processing here is largely unbound in + * length and we may have to perform significant amounts of work on each ioend + * to complete it. Hence we have to be careful about holding the CPU for too + * long in this loop. + */ void xfs_end_io( struct work_struct *work) @@ -157,6 +170,7 @@ xfs_end_io( list_del_init(&ioend->io_list); iomap_ioend_try_merge(ioend, &tmp); xfs_end_ioend(ioend); + cond_resched(); } } diff --git a/include/linux/iomap.h b/include/linux/iomap.h index b55bd49e55f5..97a3a2edb585 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -263,9 +263,11 @@ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ u16 io_type; u16 io_flags; /* IOMAP_F_* */ + u32 io_folios; /* folios added to ioend */ struct inode *io_inode; /* file being written to */ size_t io_size; /* size of the extent */ loff_t io_offset; /* offset in the file */ + sector_t io_sector; /* start sector of ioend */ struct bio *io_bio; /* bio being built */ struct bio io_inline_bio; /* MUST BE LAST! */ }; -- cgit v1.2.3 From d01ffb9eee4af165d83b08dd73ebdf9fe94a519b Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Fri, 28 Jan 2022 12:47:16 +0800 Subject: ax25: add refcount in ax25_dev to avoid UAF bugs If we dereference ax25_dev after we call kfree(ax25_dev) in ax25_dev_device_down(), it will lead to concurrency UAF bugs. There are eight syscall functions suffer from UAF bugs, include ax25_bind(), ax25_release(), ax25_connect(), ax25_ioctl(), ax25_getname(), ax25_sendmsg(), ax25_getsockopt() and ax25_info_show(). One of the concurrency UAF can be shown as below: (USE) | (FREE) | ax25_device_event | ax25_dev_device_down ax25_bind | ... ... | kfree(ax25_dev) ax25_fillin_cb() | ... ax25_fillin_cb_from_dev() | ... | The root cause of UAF bugs is that kfree(ax25_dev) in ax25_dev_device_down() is not protected by any locks. When ax25_dev, which there are still pointers point to, is released, the concurrency UAF bug will happen. This patch introduces refcount into ax25_dev in order to guarantee that there are no pointers point to it when ax25_dev is released. Signed-off-by: Duoming Zhou Signed-off-by: David S. Miller --- include/net/ax25.h | 10 ++++++++++ net/ax25/af_ax25.c | 2 ++ net/ax25/ax25_dev.c | 12 ++++++++++-- net/ax25/ax25_route.c | 3 +++ 4 files changed, 25 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ax25.h b/include/net/ax25.h index 526e49589197..50b417df6221 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -239,6 +239,7 @@ typedef struct ax25_dev { #if defined(CONFIG_AX25_DAMA_SLAVE) || defined(CONFIG_AX25_DAMA_MASTER) ax25_dama_info dama; #endif + refcount_t refcount; } ax25_dev; typedef struct ax25_cb { @@ -293,6 +294,15 @@ static __inline__ void ax25_cb_put(ax25_cb *ax25) } } +#define ax25_dev_hold(__ax25_dev) \ + refcount_inc(&((__ax25_dev)->refcount)) + +static __inline__ void ax25_dev_put(ax25_dev *ax25_dev) +{ + if (refcount_dec_and_test(&ax25_dev->refcount)) { + kfree(ax25_dev); + } +} static inline __be16 ax25_type_trans(struct sk_buff *skb, struct net_device *dev) { skb->dev = dev; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 44a8730c26ac..32f61978ff29 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -91,6 +91,7 @@ again: spin_unlock_bh(&ax25_list_lock); lock_sock(sk); s->ax25_dev = NULL; + ax25_dev_put(ax25_dev); release_sock(sk); ax25_disconnect(s, ENETUNREACH); spin_lock_bh(&ax25_list_lock); @@ -439,6 +440,7 @@ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg) } out_put: + ax25_dev_put(ax25_dev); ax25_cb_put(ax25); return ret; diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index 256fadb94df3..770b787fb7bb 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -37,6 +37,7 @@ ax25_dev *ax25_addr_ax25dev(ax25_address *addr) for (ax25_dev = ax25_dev_list; ax25_dev != NULL; ax25_dev = ax25_dev->next) if (ax25cmp(addr, (const ax25_address *)ax25_dev->dev->dev_addr) == 0) { res = ax25_dev; + ax25_dev_hold(ax25_dev); } spin_unlock_bh(&ax25_dev_lock); @@ -56,6 +57,7 @@ void ax25_dev_device_up(struct net_device *dev) return; } + refcount_set(&ax25_dev->refcount, 1); dev->ax25_ptr = ax25_dev; ax25_dev->dev = dev; dev_hold_track(dev, &ax25_dev->dev_tracker, GFP_ATOMIC); @@ -83,6 +85,7 @@ void ax25_dev_device_up(struct net_device *dev) spin_lock_bh(&ax25_dev_lock); ax25_dev->next = ax25_dev_list; ax25_dev_list = ax25_dev; + ax25_dev_hold(ax25_dev); spin_unlock_bh(&ax25_dev_lock); ax25_register_dev_sysctl(ax25_dev); @@ -112,20 +115,22 @@ void ax25_dev_device_down(struct net_device *dev) if ((s = ax25_dev_list) == ax25_dev) { ax25_dev_list = s->next; + ax25_dev_put(ax25_dev); spin_unlock_bh(&ax25_dev_lock); dev->ax25_ptr = NULL; dev_put_track(dev, &ax25_dev->dev_tracker); - kfree(ax25_dev); + ax25_dev_put(ax25_dev); return; } while (s != NULL && s->next != NULL) { if (s->next == ax25_dev) { s->next = ax25_dev->next; + ax25_dev_put(ax25_dev); spin_unlock_bh(&ax25_dev_lock); dev->ax25_ptr = NULL; dev_put_track(dev, &ax25_dev->dev_tracker); - kfree(ax25_dev); + ax25_dev_put(ax25_dev); return; } @@ -133,6 +138,7 @@ void ax25_dev_device_down(struct net_device *dev) } spin_unlock_bh(&ax25_dev_lock); dev->ax25_ptr = NULL; + ax25_dev_put(ax25_dev); } int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) @@ -149,6 +155,7 @@ int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) if (ax25_dev->forward != NULL) return -EINVAL; ax25_dev->forward = fwd_dev->dev; + ax25_dev_put(fwd_dev); break; case SIOCAX25DELFWD: @@ -161,6 +168,7 @@ int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) return -EINVAL; } + ax25_dev_put(ax25_dev); return 0; } diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index d0b2e094bd55..1e32693833e5 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -116,6 +116,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) ax25_rt->dev = ax25_dev->dev; ax25_rt->digipeat = NULL; ax25_rt->ip_mode = ' '; + ax25_dev_put(ax25_dev); if (route->digi_count != 0) { if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); @@ -172,6 +173,7 @@ static int ax25_rt_del(struct ax25_routes_struct *route) } } } + ax25_dev_put(ax25_dev); write_unlock_bh(&ax25_route_lock); return 0; @@ -214,6 +216,7 @@ static int ax25_rt_opt(struct ax25_route_opt_struct *rt_option) } out: + ax25_dev_put(ax25_dev); write_unlock_bh(&ax25_route_lock); return err; } -- cgit v1.2.3 From 3c75c0ea5da749bd1efebd1387f2e5011b8c7d78 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 19 Jan 2022 16:52:48 +0100 Subject: ASoC: soc-pcm: Fix DPCM lockdep warning due to nested stream locks The recent change for DPCM locking caused spurious lockdep warnings. Actually the warnings are false-positive, as those are triggered due to the nested stream locks for FE and BE. Since both locks belong to the same lock class, lockdep sees it as if a deadlock. For fixing this, we need to take PCM stream locks for BE with the nested lock primitives. Since currently snd_pcm_stream_lock*() helper assumes only the top-level single locking, a new helper function snd_pcm_stream_lock_irqsave_nested() is defined for a single-depth nested lock, which is now used in the BE DAI trigger that is always performed inside a FE stream lock. Fixes: b2ae80663008 ("ASoC: soc-pcm: serialize BE triggers") Reported-and-tested-by: Hans de Goede Reported-and-tested-by: Marek Szyprowski Link: https://lore.kernel.org/r/73018f3c-9769-72ea-0325-b3f8e2381e30@redhat.com Link: https://lore.kernel.org/alsa-devel/9a0abddd-49e9-872d-2f00-a1697340f786@samsung.com Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20220119155249.26754-2-tiwai@suse.de Signed-off-by: Mark Brown --- include/sound/pcm.h | 15 +++++++++++++++ sound/core/pcm_native.c | 13 +++++++++++++ sound/soc/soc-pcm.c | 6 +++--- 3 files changed, 31 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/sound/pcm.h b/include/sound/pcm.h index 33451f8ff755..524220fe1af6 100644 --- a/include/sound/pcm.h +++ b/include/sound/pcm.h @@ -614,6 +614,7 @@ void snd_pcm_stream_unlock(struct snd_pcm_substream *substream); void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream); void snd_pcm_stream_unlock_irq(struct snd_pcm_substream *substream); unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream); +unsigned long _snd_pcm_stream_lock_irqsave_nested(struct snd_pcm_substream *substream); /** * snd_pcm_stream_lock_irqsave - Lock the PCM stream @@ -632,6 +633,20 @@ unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream); void snd_pcm_stream_unlock_irqrestore(struct snd_pcm_substream *substream, unsigned long flags); +/** + * snd_pcm_stream_lock_irqsave_nested - Single-nested PCM stream locking + * @substream: PCM substream + * @flags: irq flags + * + * This locks the PCM stream like snd_pcm_stream_lock_irqsave() but with + * the single-depth lockdep subclass. + */ +#define snd_pcm_stream_lock_irqsave_nested(substream, flags) \ + do { \ + typecheck(unsigned long, flags); \ + flags = _snd_pcm_stream_lock_irqsave_nested(substream); \ + } while (0) + /** * snd_pcm_group_for_each_entry - iterate over the linked substreams * @s: the iterator diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 621883e71194..a056b3ef3c84 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -172,6 +172,19 @@ unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream) } EXPORT_SYMBOL_GPL(_snd_pcm_stream_lock_irqsave); +unsigned long _snd_pcm_stream_lock_irqsave_nested(struct snd_pcm_substream *substream) +{ + unsigned long flags = 0; + if (substream->pcm->nonatomic) + mutex_lock_nested(&substream->self_group.mutex, + SINGLE_DEPTH_NESTING); + else + spin_lock_irqsave_nested(&substream->self_group.lock, flags, + SINGLE_DEPTH_NESTING); + return flags; +} +EXPORT_SYMBOL_GPL(_snd_pcm_stream_lock_irqsave_nested); + /** * snd_pcm_stream_unlock_irqrestore - Unlock the PCM stream * @substream: PCM substream diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c index 7abfc48b26ca..e8876e65c649 100644 --- a/sound/soc/soc-pcm.c +++ b/sound/soc/soc-pcm.c @@ -46,8 +46,8 @@ static inline void snd_soc_dpcm_stream_lock_irq(struct snd_soc_pcm_runtime *rtd, snd_pcm_stream_lock_irq(snd_soc_dpcm_get_substream(rtd, stream)); } -#define snd_soc_dpcm_stream_lock_irqsave(rtd, stream, flags) \ - snd_pcm_stream_lock_irqsave(snd_soc_dpcm_get_substream(rtd, stream), flags) +#define snd_soc_dpcm_stream_lock_irqsave_nested(rtd, stream, flags) \ + snd_pcm_stream_lock_irqsave_nested(snd_soc_dpcm_get_substream(rtd, stream), flags) static inline void snd_soc_dpcm_stream_unlock_irq(struct snd_soc_pcm_runtime *rtd, int stream) @@ -2094,7 +2094,7 @@ int dpcm_be_dai_trigger(struct snd_soc_pcm_runtime *fe, int stream, be = dpcm->be; be_substream = snd_soc_dpcm_get_substream(be, stream); - snd_soc_dpcm_stream_lock_irqsave(be, stream, flags); + snd_soc_dpcm_stream_lock_irqsave_nested(be, stream, flags); /* is this op for this BE ? */ if (!snd_soc_dpcm_be_can_update(fe, be, stream)) -- cgit v1.2.3 From 06feec6005c9d9500cd286ec440aabf8b2ddd94d Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Wed, 12 Jan 2022 22:50:39 +0300 Subject: ASoC: hdmi-codec: Fix OOB memory accesses Correct size of iec_status array by changing it to the size of status array of the struct snd_aes_iec958. This fixes out-of-bounds slab read accesses made by memcpy() of the hdmi-codec driver. This problem is reported by KASAN. Cc: stable@vger.kernel.org Signed-off-by: Dmitry Osipenko Link: https://lore.kernel.org/r/20220112195039.1329-1-digetx@gmail.com Signed-off-by: Mark Brown --- include/uapi/sound/asound.h | 4 +++- sound/soc/codecs/hdmi-codec.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/sound/asound.h b/include/uapi/sound/asound.h index ff7e638221c5..228279ea0670 100644 --- a/include/uapi/sound/asound.h +++ b/include/uapi/sound/asound.h @@ -56,8 +56,10 @@ * * ****************************************************************************/ +#define AES_IEC958_STATUS_SIZE 24 + struct snd_aes_iec958 { - unsigned char status[24]; /* AES/IEC958 channel status bits */ + unsigned char status[AES_IEC958_STATUS_SIZE]; /* AES/IEC958 channel status bits */ unsigned char subcode[147]; /* AES/IEC958 subcode bits */ unsigned char pad; /* nothing */ unsigned char dig_subframe[4]; /* AES/IEC958 subframe bits */ diff --git a/sound/soc/codecs/hdmi-codec.c b/sound/soc/codecs/hdmi-codec.c index b61f980cabdc..b07607a9ecea 100644 --- a/sound/soc/codecs/hdmi-codec.c +++ b/sound/soc/codecs/hdmi-codec.c @@ -277,7 +277,7 @@ struct hdmi_codec_priv { bool busy; struct snd_soc_jack *jack; unsigned int jack_status; - u8 iec_status[5]; + u8 iec_status[AES_IEC958_STATUS_SIZE]; }; static const struct snd_soc_dapm_widget hdmi_widgets[] = { -- cgit v1.2.3 From f6c6804c43fa18d3cee64b55490dfbd3bef1363a Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 28 Jan 2022 15:40:25 +0000 Subject: kvm: Move KVM_GET_XSAVE2 IOCTL definition at the end of kvm.h This way we can more easily find the next free IOCTL number when adding new IOCTLs. Fixes: be50b2065dfa ("kvm: x86: Add support for getting/setting expanded xstate buffer") Signed-off-by: Janosch Frank Message-Id: <20220128154025.102666-1-frankja@linux.ibm.com> Signed-off-by: Paolo Bonzini --- include/uapi/linux/kvm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index b46bcdb0cab1..5191b57e1562 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1624,9 +1624,6 @@ struct kvm_enc_region { #define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3) #define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4) -/* Available with KVM_CAP_XSAVE2 */ -#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) - struct kvm_s390_pv_sec_parm { __u64 origin; __u64 length; @@ -2048,4 +2045,7 @@ struct kvm_stats_desc { #define KVM_GET_STATS_FD _IO(KVMIO, 0xce) +/* Available with KVM_CAP_XSAVE2 */ +#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) + #endif /* __LINUX_KVM_H */ -- cgit v1.2.3 From ef9989afda73332df566852d6e9ca695c05f10ce Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 1 Feb 2022 13:29:22 +0000 Subject: kvm: add guest_state_{enter,exit}_irqoff() When transitioning to/from guest mode, it is necessary to inform lockdep, tracing, and RCU in a specific order, similar to the requirements for transitions to/from user mode. Additionally, it is necessary to perform vtime accounting for a window around running the guest, with RCU enabled, such that timer interrupts taken from the guest can be accounted as guest time. Most architectures don't handle all the necessary pieces, and a have a number of common bugs, including unsafe usage of RCU during the window between guest_enter() and guest_exit(). On x86, this was dealt with across commits: 87fa7f3e98a1310e ("x86/kvm: Move context tracking where it belongs") 0642391e2139a2c1 ("x86/kvm/vmx: Add hardirq tracing to guest enter/exit") 9fc975e9efd03e57 ("x86/kvm/svm: Add hardirq tracing on guest enter/exit") 3ebccdf373c21d86 ("x86/kvm/vmx: Move guest enter/exit into .noinstr.text") 135961e0a7d555fc ("x86/kvm/svm: Move guest enter/exit into .noinstr.text") 160457140187c5fb ("KVM: x86: Defer vtime accounting 'til after IRQ handling") bc908e091b326467 ("KVM: x86: Consolidate guest enter/exit logic to common helpers") ... but those fixes are specific to x86, and as the resulting logic (while correct) is split across generic helper functions and x86-specific helper functions, it is difficult to see that the entry/exit accounting is balanced. This patch adds generic helpers which architectures can use to handle guest entry/exit consistently and correctly. The guest_{enter,exit}() helpers are split into guest_timing_{enter,exit}() to perform vtime accounting, and guest_context_{enter,exit}() to perform the necessary context tracking and RCU management. The existing guest_{enter,exit}() heleprs are left as wrappers of these. Atop this, new guest_state_enter_irqoff() and guest_state_exit_irqoff() helpers are added to handle the ordering of lockdep, tracing, and RCU manageent. These are inteneded to mirror exit_to_user_mode() and enter_from_user_mode(). Subsequent patches will migrate architectures over to the new helpers, following a sequence: guest_timing_enter_irqoff(); guest_state_enter_irqoff(); < run the vcpu > guest_state_exit_irqoff(); < take any pending IRQs > guest_timing_exit_irqoff(); This sequences handles all of the above correctly, and more clearly balances the entry and exit portions, making it easier to understand. The existing helpers are marked as deprecated, and will be removed once all architectures have been converted. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Reviewed-by: Marc Zyngier Reviewed-by: Paolo Bonzini Reviewed-by: Nicolas Saenz Julienne Message-Id: <20220201132926.3301912-2-mark.rutland@arm.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 112 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 109 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f079820f52b5..b3810976a27f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -29,7 +29,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -368,8 +370,11 @@ struct kvm_vcpu { u64 last_used_slot_gen; }; -/* must be called with irqs disabled */ -static __always_inline void guest_enter_irqoff(void) +/* + * Start accounting time towards a guest. + * Must be called before entering guest context. + */ +static __always_inline void guest_timing_enter_irqoff(void) { /* * This is running in ioctl context so its safe to assume that it's the @@ -378,7 +383,18 @@ static __always_inline void guest_enter_irqoff(void) instrumentation_begin(); vtime_account_guest_enter(); instrumentation_end(); +} +/* + * Enter guest context and enter an RCU extended quiescent state. + * + * Between guest_context_enter_irqoff() and guest_context_exit_irqoff() it is + * unsafe to use any code which may directly or indirectly use RCU, tracing + * (including IRQ flag tracing), or lockdep. All code in this period must be + * non-instrumentable. + */ +static __always_inline void guest_context_enter_irqoff(void) +{ /* * KVM does not hold any references to rcu protected data when it * switches CPU into a guest mode. In fact switching to a guest mode @@ -394,16 +410,79 @@ static __always_inline void guest_enter_irqoff(void) } } -static __always_inline void guest_exit_irqoff(void) +/* + * Deprecated. Architectures should move to guest_timing_enter_irqoff() and + * guest_state_enter_irqoff(). + */ +static __always_inline void guest_enter_irqoff(void) +{ + guest_timing_enter_irqoff(); + guest_context_enter_irqoff(); +} + +/** + * guest_state_enter_irqoff - Fixup state when entering a guest + * + * Entry to a guest will enable interrupts, but the kernel state is interrupts + * disabled when this is invoked. Also tell RCU about it. + * + * 1) Trace interrupts on state + * 2) Invoke context tracking if enabled to adjust RCU state + * 3) Tell lockdep that interrupts are enabled + * + * Invoked from architecture specific code before entering a guest. + * Must be called with interrupts disabled and the caller must be + * non-instrumentable. + * The caller has to invoke guest_timing_enter_irqoff() before this. + * + * Note: this is analogous to exit_to_user_mode(). + */ +static __always_inline void guest_state_enter_irqoff(void) +{ + instrumentation_begin(); + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + instrumentation_end(); + + guest_context_enter_irqoff(); + lockdep_hardirqs_on(CALLER_ADDR0); +} + +/* + * Exit guest context and exit an RCU extended quiescent state. + * + * Between guest_context_enter_irqoff() and guest_context_exit_irqoff() it is + * unsafe to use any code which may directly or indirectly use RCU, tracing + * (including IRQ flag tracing), or lockdep. All code in this period must be + * non-instrumentable. + */ +static __always_inline void guest_context_exit_irqoff(void) { context_tracking_guest_exit(); +} +/* + * Stop accounting time towards a guest. + * Must be called after exiting guest context. + */ +static __always_inline void guest_timing_exit_irqoff(void) +{ instrumentation_begin(); /* Flush the guest cputime we spent on the guest */ vtime_account_guest_exit(); instrumentation_end(); } +/* + * Deprecated. Architectures should move to guest_state_exit_irqoff() and + * guest_timing_exit_irqoff(). + */ +static __always_inline void guest_exit_irqoff(void) +{ + guest_context_exit_irqoff(); + guest_timing_exit_irqoff(); +} + static inline void guest_exit(void) { unsigned long flags; @@ -413,6 +492,33 @@ static inline void guest_exit(void) local_irq_restore(flags); } +/** + * guest_state_exit_irqoff - Establish state when returning from guest mode + * + * Entry from a guest disables interrupts, but guest mode is traced as + * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. + * + * 1) Tell lockdep that interrupts are disabled + * 2) Invoke context tracking if enabled to reactivate RCU + * 3) Trace interrupts off state + * + * Invoked from architecture specific code after exiting a guest. + * Must be invoked with interrupts disabled and the caller must be + * non-instrumentable. + * The caller has to invoke guest_timing_exit_irqoff() after this. + * + * Note: this is analogous to enter_from_user_mode(). + */ +static __always_inline void guest_state_exit_irqoff(void) +{ + lockdep_hardirqs_off(CALLER_ADDR0); + guest_context_exit_irqoff(); + + instrumentation_begin(); + trace_hardirqs_off_finish(); + instrumentation_end(); +} + static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) { /* -- cgit v1.2.3 From bee9f65523218e3baeeecde9295c8fbe9bc08e0a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Jan 2022 16:02:50 +0000 Subject: netfs, cachefiles: Add a method to query presence of data in the cache Add a netfs_cache_ops method by which a network filesystem can ask the cache about what data it has available and where so that it can make a multipage read more efficient. Signed-off-by: David Howells cc: linux-cachefs@redhat.com Acked-by: Jeff Layton Reviewed-by: Rohith Surabattula Signed-off-by: Steve French --- Documentation/filesystems/netfs_library.rst | 16 ++++++++ fs/cachefiles/io.c | 59 +++++++++++++++++++++++++++++ include/linux/netfs.h | 7 ++++ 3 files changed, 82 insertions(+) (limited to 'include') diff --git a/Documentation/filesystems/netfs_library.rst b/Documentation/filesystems/netfs_library.rst index 136f8da3d0e2..4f373a8ec47b 100644 --- a/Documentation/filesystems/netfs_library.rst +++ b/Documentation/filesystems/netfs_library.rst @@ -462,6 +462,10 @@ operation table looks like the following:: struct iov_iter *iter, netfs_io_terminated_t term_func, void *term_func_priv); + + int (*query_occupancy)(struct netfs_cache_resources *cres, + loff_t start, size_t len, size_t granularity, + loff_t *_data_start, size_t *_data_len); }; With a termination handler function pointer:: @@ -536,6 +540,18 @@ The methods defined in the table are: indicating whether the termination is definitely happening in the caller's context. + * ``query_occupancy()`` + + [Required] Called to find out where the next piece of data is within a + particular region of the cache. The start and length of the region to be + queried are passed in, along with the granularity to which the answer needs + to be aligned. The function passes back the start and length of the data, + if any, available within that region. Note that there may be a hole at the + front. + + It returns 0 if some data was found, -ENODATA if there was no usable data + within the region or -ENOBUFS if there is no caching on this file. + Note that these methods are passed a pointer to the cache resource structure, not the read request structure as they could be used in other situations where there isn't a read request structure as well, such as writing dirty data to the diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 04eb52736990..753986ea1583 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -191,6 +191,64 @@ presubmission_error: return ret; } +/* + * Query the occupancy of the cache in a region, returning where the next chunk + * of data starts and how long it is. + */ +static int cachefiles_query_occupancy(struct netfs_cache_resources *cres, + loff_t start, size_t len, size_t granularity, + loff_t *_data_start, size_t *_data_len) +{ + struct cachefiles_object *object; + struct file *file; + loff_t off, off2; + + *_data_start = -1; + *_data_len = 0; + + if (!fscache_wait_for_operation(cres, FSCACHE_WANT_READ)) + return -ENOBUFS; + + object = cachefiles_cres_object(cres); + file = cachefiles_cres_file(cres); + granularity = max_t(size_t, object->volume->cache->bsize, granularity); + + _enter("%pD,%li,%llx,%zx/%llx", + file, file_inode(file)->i_ino, start, len, + i_size_read(file_inode(file))); + + off = cachefiles_inject_read_error(); + if (off == 0) + off = vfs_llseek(file, start, SEEK_DATA); + if (off == -ENXIO) + return -ENODATA; /* Beyond EOF */ + if (off < 0 && off >= (loff_t)-MAX_ERRNO) + return -ENOBUFS; /* Error. */ + if (round_up(off, granularity) >= start + len) + return -ENODATA; /* No data in range */ + + off2 = cachefiles_inject_read_error(); + if (off2 == 0) + off2 = vfs_llseek(file, off, SEEK_HOLE); + if (off2 == -ENXIO) + return -ENODATA; /* Beyond EOF */ + if (off2 < 0 && off2 >= (loff_t)-MAX_ERRNO) + return -ENOBUFS; /* Error. */ + + /* Round away partial blocks */ + off = round_up(off, granularity); + off2 = round_down(off2, granularity); + if (off2 <= off) + return -ENODATA; + + *_data_start = off; + if (off2 > start + len) + *_data_len = len; + else + *_data_len = off2 - off; + return 0; +} + /* * Handle completion of a write to the cache. */ @@ -545,6 +603,7 @@ static const struct netfs_cache_ops cachefiles_netfs_cache_ops = { .write = cachefiles_write, .prepare_read = cachefiles_prepare_read, .prepare_write = cachefiles_prepare_write, + .query_occupancy = cachefiles_query_occupancy, }; /* diff --git a/include/linux/netfs.h b/include/linux/netfs.h index b46c39d98bbd..614f22213e21 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -244,6 +244,13 @@ struct netfs_cache_ops { int (*prepare_write)(struct netfs_cache_resources *cres, loff_t *_start, size_t *_len, loff_t i_size, bool no_space_allocated_yet); + + /* Query the occupancy of the cache in a region, returning where the + * next chunk of data starts and how long it is. + */ + int (*query_occupancy)(struct netfs_cache_resources *cres, + loff_t start, size_t len, size_t granularity, + loff_t *_data_start, size_t *_data_len); }; struct readahead_control; -- cgit v1.2.3 From 6d5c900eb64107001e91e1f46bddc254dded8a59 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 24 Jan 2022 09:22:41 -0800 Subject: net/mlx5e: Use struct_group() for memcpy() region In preparation for FORTIFY_SOURCE performing compile-time and run-time field bounds checking for memcpy(), memmove(), and memset(), avoid intentionally writing across neighboring fields. Use struct_group() in struct vlan_ethhdr around members h_dest and h_source, so they can be referenced together. This will allow memcpy() and sizeof() to more easily reason about sizes, improve readability, and avoid future warnings about writing beyond the end of h_dest. "pahole" shows no size nor member offset changes to struct vlan_ethhdr. "objdump -d" shows no object code changes. Fixes: 34802a42b352 ("net/mlx5e: Do not modify the TX SKB") Signed-off-by: Kees Cook Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 2 +- include/linux/if_vlan.h | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index 7fd33b356cc8..ee7ecb88adc1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -208,7 +208,7 @@ static inline void mlx5e_insert_vlan(void *start, struct sk_buff *skb, u16 ihs) int cpy1_sz = 2 * ETH_ALEN; int cpy2_sz = ihs - cpy1_sz; - memcpy(vhdr, skb->data, cpy1_sz); + memcpy(&vhdr->addrs, skb->data, cpy1_sz); vhdr->h_vlan_proto = skb->vlan_proto; vhdr->h_vlan_TCI = cpu_to_be16(skb_vlan_tag_get(skb)); memcpy(&vhdr->h_vlan_encapsulated_proto, skb->data + cpy1_sz, cpy2_sz); diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 8420fe504927..2be4dd7e90a9 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -46,8 +46,10 @@ struct vlan_hdr { * @h_vlan_encapsulated_proto: packet type ID or len */ struct vlan_ethhdr { - unsigned char h_dest[ETH_ALEN]; - unsigned char h_source[ETH_ALEN]; + struct_group(addrs, + unsigned char h_dest[ETH_ALEN]; + unsigned char h_source[ETH_ALEN]; + ); __be16 h_vlan_proto; __be16 h_vlan_TCI; __be16 h_vlan_encapsulated_proto; -- cgit v1.2.3 From ddecd22878601a606d160680fa85802b75d92eb6 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 31 Jan 2022 11:34:07 +0100 Subject: perf: uapi: Document perf_event_attr::sig_data truncation on 32 bit architectures Due to the alignment requirements of siginfo_t, as described in 3ddb3fd8cdb0 ("signal, perf: Fix siginfo_t by avoiding u64 on 32-bit architectures"), siginfo_t::si_perf_data is limited to an unsigned long. However, perf_event_attr::sig_data is an u64, to avoid having to deal with compat conversions. Due to being an u64, it may not immediately be clear to users that sig_data is truncated on 32 bit architectures. Add a comment to explicitly point this out, and hopefully help some users save time by not having to deduce themselves what's happening. Reported-by: Dmitry Vyukov Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Link: https://lore.kernel.org/r/20220131103407.1971678-3-elver@google.com --- include/uapi/linux/perf_event.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 1b65042ab1db..82858b697c05 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -465,6 +465,8 @@ struct perf_event_attr { /* * User provided data if sigtrap=1, passed back to user via * siginfo_t::si_perf_data, e.g. to permit user to identify the event. + * Note, siginfo_t::si_perf_data is long-sized, and sig_data will be + * truncated accordingly on 32 bit architectures. */ __u64 sig_data; }; -- cgit v1.2.3 From 1148836fd3226c20de841084aba24184d4fbbe77 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 2 Feb 2022 14:55:29 +0100 Subject: Revert "fbdev: Garbage collect fbdev scrolling acceleration, part 1 (from TODO list)" This reverts commit b3ec8cdf457e5e63d396fe1346cc788cf7c1b578. Revert the second (of 2) commits which disabled scrolling acceleration in fbcon/fbdev. It introduced a regression for fbdev-supported graphic cards because of the performance penalty by doing screen scrolling by software instead of using the existing graphic card 2D hardware acceleration. Console scrolling acceleration was disabled by dropping code which checked at runtime the driver hardware capabilities for the BINFO_HWACCEL_COPYAREA or FBINFO_HWACCEL_FILLRECT flags and if set, it enabled scrollmode SCROLL_MOVE which uses hardware acceleration to move screen contents. After dropping those checks scrollmode was hard-wired to SCROLL_REDRAW instead, which forces all graphic cards to redraw every character at the new screen position when scrolling. This change effectively disabled all hardware-based scrolling acceleration for ALL drivers, because now all kind of 2D hardware acceleration (bitblt, fillrect) in the drivers isn't used any longer. The original commit message mentions that only 3 DRM drivers (nouveau, omapdrm and gma500) used hardware acceleration in the past and thus code for checking and using scrolling acceleration is obsolete. This statement is NOT TRUE, because beside the DRM drivers there are around 35 other fbdev drivers which depend on fbdev/fbcon and still provide hardware acceleration for fbdev/fbcon. The original commit message also states that syzbot found lots of bugs in fbcon and thus it's "often the solution to just delete code and remove features". This is true, and the bugs - which actually affected all users of fbcon, including DRM - were fixed, or code was dropped like e.g. the support for software scrollback in vgacon (commit 973c096f6a85). So to further analyze which bugs were found by syzbot, I've looked through all patches in drivers/video which were tagged with syzbot or syzkaller back to year 2005. The vast majority fixed the reported issues on a higher level, e.g. when screen is to be resized, or when font size is to be changed. The few ones which touched driver code fixed a real driver bug, e.g. by adding a check. But NONE of those patches touched code of either the SCROLL_MOVE or the SCROLL_REDRAW case. That means, there was no real reason why SCROLL_MOVE had to be ripped-out and just SCROLL_REDRAW had to be used instead. The only reason I can imagine so far was that SCROLL_MOVE wasn't used by DRM and as such it was assumed that it could go away. That argument completely missed the fact that SCROLL_MOVE is still heavily used by fbdev (non-DRM) drivers. Some people mention that using memcpy() instead of the hardware acceleration is pretty much the same speed. But that's not true, at least not for older graphic cards and machines where we see speed decreases by factor 10 and more and thus this change leads to console responsiveness way worse than before. That's why the original commit is to be reverted. By reverting we reintroduce hardware-based scrolling acceleration and fix the performance regression for fbdev drivers. There isn't any impact on DRM when reverting those patches. Signed-off-by: Helge Deller Acked-by: Geert Uytterhoeven Acked-by: Sven Schnelle Cc: stable@vger.kernel.org # v5.16+ Signed-off-by: Helge Deller Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20220202135531.92183-2-deller@gmx.de --- Documentation/gpu/todo.rst | 13 +- drivers/video/fbdev/core/bitblit.c | 16 + drivers/video/fbdev/core/fbcon.c | 509 ++++++++++++++++++++++++++++++-- drivers/video/fbdev/core/fbcon.h | 59 ++++ drivers/video/fbdev/core/fbcon_ccw.c | 28 +- drivers/video/fbdev/core/fbcon_cw.c | 28 +- drivers/video/fbdev/core/fbcon_rotate.h | 9 + drivers/video/fbdev/core/fbcon_ud.c | 37 ++- drivers/video/fbdev/core/tileblit.c | 16 + drivers/video/fbdev/skeletonfb.c | 12 +- include/linux/fb.h | 2 +- 11 files changed, 678 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/Documentation/gpu/todo.rst b/Documentation/gpu/todo.rst index da138dd39883..29506815d24a 100644 --- a/Documentation/gpu/todo.rst +++ b/Documentation/gpu/todo.rst @@ -303,19 +303,16 @@ Level: Advanced Garbage collect fbdev scrolling acceleration -------------------------------------------- -Scroll acceleration has been disabled in fbcon. Now it works as the old -SCROLL_REDRAW mode. A ton of code was removed in fbcon.c and the hook bmove was -removed from fbcon_ops. -Remaining tasks: +Scroll acceleration is disabled in fbcon by hard-wiring p->scrollmode = +SCROLL_REDRAW. There's a ton of code this will allow us to remove: -- a bunch of the hooks in fbcon_ops could be removed or simplified by calling +- lots of code in fbcon.c + +- a bunch of the hooks in fbcon_ops, maybe the remaining hooks could be called directly instead of the function table (with a switch on p->rotate) - fb_copyarea is unused after this, and can be deleted from all drivers -- after that, fb_copyarea can be deleted from fb_ops in include/linux/fb.h as - well as cfb_copyarea - Note that not all acceleration code can be deleted, since clearing and cursor support is still accelerated, which might be good candidates for further deletion projects. diff --git a/drivers/video/fbdev/core/bitblit.c b/drivers/video/fbdev/core/bitblit.c index 01fae2c96965..f98e8f298bc1 100644 --- a/drivers/video/fbdev/core/bitblit.c +++ b/drivers/video/fbdev/core/bitblit.c @@ -43,6 +43,21 @@ static void update_attr(u8 *dst, u8 *src, int attribute, } } +static void bit_bmove(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int dy, int dx, int height, int width) +{ + struct fb_copyarea area; + + area.sx = sx * vc->vc_font.width; + area.sy = sy * vc->vc_font.height; + area.dx = dx * vc->vc_font.width; + area.dy = dy * vc->vc_font.height; + area.height = height * vc->vc_font.height; + area.width = width * vc->vc_font.width; + + info->fbops->fb_copyarea(info, &area); +} + static void bit_clear(struct vc_data *vc, struct fb_info *info, int sy, int sx, int height, int width) { @@ -378,6 +393,7 @@ static int bit_update_start(struct fb_info *info) void fbcon_set_bitops(struct fbcon_ops *ops) { + ops->bmove = bit_bmove; ops->clear = bit_clear; ops->putcs = bit_putcs; ops->clear_margins = bit_clear_margins; diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 99ecd9a6d844..fc34caddf9cf 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -173,6 +173,8 @@ static void fbcon_putcs(struct vc_data *vc, const unsigned short *s, int count, int ypos, int xpos); static void fbcon_clear_margins(struct vc_data *vc, int bottom_only); static void fbcon_cursor(struct vc_data *vc, int mode); +static void fbcon_bmove(struct vc_data *vc, int sy, int sx, int dy, int dx, + int height, int width); static int fbcon_switch(struct vc_data *vc); static int fbcon_blank(struct vc_data *vc, int blank, int mode_switch); static void fbcon_set_palette(struct vc_data *vc, const unsigned char *table); @@ -180,8 +182,16 @@ static void fbcon_set_palette(struct vc_data *vc, const unsigned char *table); /* * Internal routines */ +static __inline__ void ywrap_up(struct vc_data *vc, int count); +static __inline__ void ywrap_down(struct vc_data *vc, int count); +static __inline__ void ypan_up(struct vc_data *vc, int count); +static __inline__ void ypan_down(struct vc_data *vc, int count); +static void fbcon_bmove_rec(struct vc_data *vc, struct fbcon_display *p, int sy, int sx, + int dy, int dx, int height, int width, u_int y_break); static void fbcon_set_disp(struct fb_info *info, struct fb_var_screeninfo *var, int unit); +static void fbcon_redraw_move(struct vc_data *vc, struct fbcon_display *p, + int line, int count, int dy); static void fbcon_modechanged(struct fb_info *info); static void fbcon_set_all_vcs(struct fb_info *info); static void fbcon_start(void); @@ -1125,6 +1135,14 @@ static void fbcon_init(struct vc_data *vc, int init) ops->graphics = 0; + /* + * No more hw acceleration for fbcon. + * + * FIXME: Garbage collect all the now dead code after sufficient time + * has passed. + */ + p->scrollmode = SCROLL_REDRAW; + /* * ++guenther: console.c:vc_allocate() relies on initializing * vc_{cols,rows}, but we must not set those if we are only @@ -1211,13 +1229,14 @@ finished: * This system is now divided into two levels because of complications * caused by hardware scrolling. Top level functions: * - * fbcon_clear(), fbcon_putc(), fbcon_clear_margins() + * fbcon_bmove(), fbcon_clear(), fbcon_putc(), fbcon_clear_margins() * * handles y values in range [0, scr_height-1] that correspond to real * screen positions. y_wrap shift means that first line of bitmap may be * anywhere on this display. These functions convert lineoffsets to * bitmap offsets and deal with the wrap-around case by splitting blits. * + * fbcon_bmove_physical_8() -- These functions fast implementations * fbcon_clear_physical_8() -- of original fbcon_XXX fns. * fbcon_putc_physical_8() -- (font width != 8) may be added later * @@ -1390,6 +1409,224 @@ static void fbcon_set_disp(struct fb_info *info, struct fb_var_screeninfo *var, } } +static __inline__ void ywrap_up(struct vc_data *vc, int count) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + struct fbcon_display *p = &fb_display[vc->vc_num]; + + p->yscroll += count; + if (p->yscroll >= p->vrows) /* Deal with wrap */ + p->yscroll -= p->vrows; + ops->var.xoffset = 0; + ops->var.yoffset = p->yscroll * vc->vc_font.height; + ops->var.vmode |= FB_VMODE_YWRAP; + ops->update_start(info); + scrollback_max += count; + if (scrollback_max > scrollback_phys_max) + scrollback_max = scrollback_phys_max; + scrollback_current = 0; +} + +static __inline__ void ywrap_down(struct vc_data *vc, int count) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + struct fbcon_display *p = &fb_display[vc->vc_num]; + + p->yscroll -= count; + if (p->yscroll < 0) /* Deal with wrap */ + p->yscroll += p->vrows; + ops->var.xoffset = 0; + ops->var.yoffset = p->yscroll * vc->vc_font.height; + ops->var.vmode |= FB_VMODE_YWRAP; + ops->update_start(info); + scrollback_max -= count; + if (scrollback_max < 0) + scrollback_max = 0; + scrollback_current = 0; +} + +static __inline__ void ypan_up(struct vc_data *vc, int count) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_display *p = &fb_display[vc->vc_num]; + struct fbcon_ops *ops = info->fbcon_par; + + p->yscroll += count; + if (p->yscroll > p->vrows - vc->vc_rows) { + ops->bmove(vc, info, p->vrows - vc->vc_rows, + 0, 0, 0, vc->vc_rows, vc->vc_cols); + p->yscroll -= p->vrows - vc->vc_rows; + } + + ops->var.xoffset = 0; + ops->var.yoffset = p->yscroll * vc->vc_font.height; + ops->var.vmode &= ~FB_VMODE_YWRAP; + ops->update_start(info); + fbcon_clear_margins(vc, 1); + scrollback_max += count; + if (scrollback_max > scrollback_phys_max) + scrollback_max = scrollback_phys_max; + scrollback_current = 0; +} + +static __inline__ void ypan_up_redraw(struct vc_data *vc, int t, int count) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + struct fbcon_display *p = &fb_display[vc->vc_num]; + + p->yscroll += count; + + if (p->yscroll > p->vrows - vc->vc_rows) { + p->yscroll -= p->vrows - vc->vc_rows; + fbcon_redraw_move(vc, p, t + count, vc->vc_rows - count, t); + } + + ops->var.xoffset = 0; + ops->var.yoffset = p->yscroll * vc->vc_font.height; + ops->var.vmode &= ~FB_VMODE_YWRAP; + ops->update_start(info); + fbcon_clear_margins(vc, 1); + scrollback_max += count; + if (scrollback_max > scrollback_phys_max) + scrollback_max = scrollback_phys_max; + scrollback_current = 0; +} + +static __inline__ void ypan_down(struct vc_data *vc, int count) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_display *p = &fb_display[vc->vc_num]; + struct fbcon_ops *ops = info->fbcon_par; + + p->yscroll -= count; + if (p->yscroll < 0) { + ops->bmove(vc, info, 0, 0, p->vrows - vc->vc_rows, + 0, vc->vc_rows, vc->vc_cols); + p->yscroll += p->vrows - vc->vc_rows; + } + + ops->var.xoffset = 0; + ops->var.yoffset = p->yscroll * vc->vc_font.height; + ops->var.vmode &= ~FB_VMODE_YWRAP; + ops->update_start(info); + fbcon_clear_margins(vc, 1); + scrollback_max -= count; + if (scrollback_max < 0) + scrollback_max = 0; + scrollback_current = 0; +} + +static __inline__ void ypan_down_redraw(struct vc_data *vc, int t, int count) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + struct fbcon_display *p = &fb_display[vc->vc_num]; + + p->yscroll -= count; + + if (p->yscroll < 0) { + p->yscroll += p->vrows - vc->vc_rows; + fbcon_redraw_move(vc, p, t, vc->vc_rows - count, t + count); + } + + ops->var.xoffset = 0; + ops->var.yoffset = p->yscroll * vc->vc_font.height; + ops->var.vmode &= ~FB_VMODE_YWRAP; + ops->update_start(info); + fbcon_clear_margins(vc, 1); + scrollback_max -= count; + if (scrollback_max < 0) + scrollback_max = 0; + scrollback_current = 0; +} + +static void fbcon_redraw_move(struct vc_data *vc, struct fbcon_display *p, + int line, int count, int dy) +{ + unsigned short *s = (unsigned short *) + (vc->vc_origin + vc->vc_size_row * line); + + while (count--) { + unsigned short *start = s; + unsigned short *le = advance_row(s, 1); + unsigned short c; + int x = 0; + unsigned short attr = 1; + + do { + c = scr_readw(s); + if (attr != (c & 0xff00)) { + attr = c & 0xff00; + if (s > start) { + fbcon_putcs(vc, start, s - start, + dy, x); + x += s - start; + start = s; + } + } + console_conditional_schedule(); + s++; + } while (s < le); + if (s > start) + fbcon_putcs(vc, start, s - start, dy, x); + console_conditional_schedule(); + dy++; + } +} + +static void fbcon_redraw_blit(struct vc_data *vc, struct fb_info *info, + struct fbcon_display *p, int line, int count, int ycount) +{ + int offset = ycount * vc->vc_cols; + unsigned short *d = (unsigned short *) + (vc->vc_origin + vc->vc_size_row * line); + unsigned short *s = d + offset; + struct fbcon_ops *ops = info->fbcon_par; + + while (count--) { + unsigned short *start = s; + unsigned short *le = advance_row(s, 1); + unsigned short c; + int x = 0; + + do { + c = scr_readw(s); + + if (c == scr_readw(d)) { + if (s > start) { + ops->bmove(vc, info, line + ycount, x, + line, x, 1, s-start); + x += s - start + 1; + start = s + 1; + } else { + x++; + start++; + } + } + + scr_writew(c, d); + console_conditional_schedule(); + s++; + d++; + } while (s < le); + if (s > start) + ops->bmove(vc, info, line + ycount, x, line, x, 1, + s-start); + console_conditional_schedule(); + if (ycount > 0) + line++; + else { + line--; + /* NOTE: We subtract two lines from these pointers */ + s -= vc->vc_size_row; + d -= vc->vc_size_row; + } + } +} + static void fbcon_redraw(struct vc_data *vc, struct fbcon_display *p, int line, int count, int offset) { @@ -1450,6 +1687,7 @@ static bool fbcon_scroll(struct vc_data *vc, unsigned int t, unsigned int b, { struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; struct fbcon_display *p = &fb_display[vc->vc_num]; + int scroll_partial = info->flags & FBINFO_PARTIAL_PAN_OK; if (fbcon_is_inactive(vc, info)) return true; @@ -1466,32 +1704,249 @@ static bool fbcon_scroll(struct vc_data *vc, unsigned int t, unsigned int b, case SM_UP: if (count > vc->vc_rows) /* Maximum realistic size */ count = vc->vc_rows; - fbcon_redraw(vc, p, t, b - t - count, - count * vc->vc_cols); - fbcon_clear(vc, b - count, 0, count, vc->vc_cols); - scr_memsetw((unsigned short *) (vc->vc_origin + - vc->vc_size_row * - (b - count)), - vc->vc_video_erase_char, - vc->vc_size_row * count); - return true; + if (logo_shown >= 0) + goto redraw_up; + switch (p->scrollmode) { + case SCROLL_MOVE: + fbcon_redraw_blit(vc, info, p, t, b - t - count, + count); + fbcon_clear(vc, b - count, 0, count, vc->vc_cols); + scr_memsetw((unsigned short *) (vc->vc_origin + + vc->vc_size_row * + (b - count)), + vc->vc_video_erase_char, + vc->vc_size_row * count); + return true; + + case SCROLL_WRAP_MOVE: + if (b - t - count > 3 * vc->vc_rows >> 2) { + if (t > 0) + fbcon_bmove(vc, 0, 0, count, 0, t, + vc->vc_cols); + ywrap_up(vc, count); + if (vc->vc_rows - b > 0) + fbcon_bmove(vc, b - count, 0, b, 0, + vc->vc_rows - b, + vc->vc_cols); + } else if (info->flags & FBINFO_READS_FAST) + fbcon_bmove(vc, t + count, 0, t, 0, + b - t - count, vc->vc_cols); + else + goto redraw_up; + fbcon_clear(vc, b - count, 0, count, vc->vc_cols); + break; + + case SCROLL_PAN_REDRAW: + if ((p->yscroll + count <= + 2 * (p->vrows - vc->vc_rows)) + && ((!scroll_partial && (b - t == vc->vc_rows)) + || (scroll_partial + && (b - t - count > + 3 * vc->vc_rows >> 2)))) { + if (t > 0) + fbcon_redraw_move(vc, p, 0, t, count); + ypan_up_redraw(vc, t, count); + if (vc->vc_rows - b > 0) + fbcon_redraw_move(vc, p, b, + vc->vc_rows - b, b); + } else + fbcon_redraw_move(vc, p, t + count, b - t - count, t); + fbcon_clear(vc, b - count, 0, count, vc->vc_cols); + break; + + case SCROLL_PAN_MOVE: + if ((p->yscroll + count <= + 2 * (p->vrows - vc->vc_rows)) + && ((!scroll_partial && (b - t == vc->vc_rows)) + || (scroll_partial + && (b - t - count > + 3 * vc->vc_rows >> 2)))) { + if (t > 0) + fbcon_bmove(vc, 0, 0, count, 0, t, + vc->vc_cols); + ypan_up(vc, count); + if (vc->vc_rows - b > 0) + fbcon_bmove(vc, b - count, 0, b, 0, + vc->vc_rows - b, + vc->vc_cols); + } else if (info->flags & FBINFO_READS_FAST) + fbcon_bmove(vc, t + count, 0, t, 0, + b - t - count, vc->vc_cols); + else + goto redraw_up; + fbcon_clear(vc, b - count, 0, count, vc->vc_cols); + break; + + case SCROLL_REDRAW: + redraw_up: + fbcon_redraw(vc, p, t, b - t - count, + count * vc->vc_cols); + fbcon_clear(vc, b - count, 0, count, vc->vc_cols); + scr_memsetw((unsigned short *) (vc->vc_origin + + vc->vc_size_row * + (b - count)), + vc->vc_video_erase_char, + vc->vc_size_row * count); + return true; + } + break; case SM_DOWN: if (count > vc->vc_rows) /* Maximum realistic size */ count = vc->vc_rows; - fbcon_redraw(vc, p, b - 1, b - t - count, - -count * vc->vc_cols); - fbcon_clear(vc, t, 0, count, vc->vc_cols); - scr_memsetw((unsigned short *) (vc->vc_origin + - vc->vc_size_row * - t), - vc->vc_video_erase_char, - vc->vc_size_row * count); - return true; + if (logo_shown >= 0) + goto redraw_down; + switch (p->scrollmode) { + case SCROLL_MOVE: + fbcon_redraw_blit(vc, info, p, b - 1, b - t - count, + -count); + fbcon_clear(vc, t, 0, count, vc->vc_cols); + scr_memsetw((unsigned short *) (vc->vc_origin + + vc->vc_size_row * + t), + vc->vc_video_erase_char, + vc->vc_size_row * count); + return true; + + case SCROLL_WRAP_MOVE: + if (b - t - count > 3 * vc->vc_rows >> 2) { + if (vc->vc_rows - b > 0) + fbcon_bmove(vc, b, 0, b - count, 0, + vc->vc_rows - b, + vc->vc_cols); + ywrap_down(vc, count); + if (t > 0) + fbcon_bmove(vc, count, 0, 0, 0, t, + vc->vc_cols); + } else if (info->flags & FBINFO_READS_FAST) + fbcon_bmove(vc, t, 0, t + count, 0, + b - t - count, vc->vc_cols); + else + goto redraw_down; + fbcon_clear(vc, t, 0, count, vc->vc_cols); + break; + + case SCROLL_PAN_MOVE: + if ((count - p->yscroll <= p->vrows - vc->vc_rows) + && ((!scroll_partial && (b - t == vc->vc_rows)) + || (scroll_partial + && (b - t - count > + 3 * vc->vc_rows >> 2)))) { + if (vc->vc_rows - b > 0) + fbcon_bmove(vc, b, 0, b - count, 0, + vc->vc_rows - b, + vc->vc_cols); + ypan_down(vc, count); + if (t > 0) + fbcon_bmove(vc, count, 0, 0, 0, t, + vc->vc_cols); + } else if (info->flags & FBINFO_READS_FAST) + fbcon_bmove(vc, t, 0, t + count, 0, + b - t - count, vc->vc_cols); + else + goto redraw_down; + fbcon_clear(vc, t, 0, count, vc->vc_cols); + break; + + case SCROLL_PAN_REDRAW: + if ((count - p->yscroll <= p->vrows - vc->vc_rows) + && ((!scroll_partial && (b - t == vc->vc_rows)) + || (scroll_partial + && (b - t - count > + 3 * vc->vc_rows >> 2)))) { + if (vc->vc_rows - b > 0) + fbcon_redraw_move(vc, p, b, vc->vc_rows - b, + b - count); + ypan_down_redraw(vc, t, count); + if (t > 0) + fbcon_redraw_move(vc, p, count, t, 0); + } else + fbcon_redraw_move(vc, p, t, b - t - count, t + count); + fbcon_clear(vc, t, 0, count, vc->vc_cols); + break; + + case SCROLL_REDRAW: + redraw_down: + fbcon_redraw(vc, p, b - 1, b - t - count, + -count * vc->vc_cols); + fbcon_clear(vc, t, 0, count, vc->vc_cols); + scr_memsetw((unsigned short *) (vc->vc_origin + + vc->vc_size_row * + t), + vc->vc_video_erase_char, + vc->vc_size_row * count); + return true; + } } return false; } + +static void fbcon_bmove(struct vc_data *vc, int sy, int sx, int dy, int dx, + int height, int width) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_display *p = &fb_display[vc->vc_num]; + + if (fbcon_is_inactive(vc, info)) + return; + + if (!width || !height) + return; + + /* Split blits that cross physical y_wrap case. + * Pathological case involves 4 blits, better to use recursive + * code rather than unrolled case + * + * Recursive invocations don't need to erase the cursor over and + * over again, so we use fbcon_bmove_rec() + */ + fbcon_bmove_rec(vc, p, sy, sx, dy, dx, height, width, + p->vrows - p->yscroll); +} + +static void fbcon_bmove_rec(struct vc_data *vc, struct fbcon_display *p, int sy, int sx, + int dy, int dx, int height, int width, u_int y_break) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + u_int b; + + if (sy < y_break && sy + height > y_break) { + b = y_break - sy; + if (dy < sy) { /* Avoid trashing self */ + fbcon_bmove_rec(vc, p, sy, sx, dy, dx, b, width, + y_break); + fbcon_bmove_rec(vc, p, sy + b, sx, dy + b, dx, + height - b, width, y_break); + } else { + fbcon_bmove_rec(vc, p, sy + b, sx, dy + b, dx, + height - b, width, y_break); + fbcon_bmove_rec(vc, p, sy, sx, dy, dx, b, width, + y_break); + } + return; + } + + if (dy < y_break && dy + height > y_break) { + b = y_break - dy; + if (dy < sy) { /* Avoid trashing self */ + fbcon_bmove_rec(vc, p, sy, sx, dy, dx, b, width, + y_break); + fbcon_bmove_rec(vc, p, sy + b, sx, dy + b, dx, + height - b, width, y_break); + } else { + fbcon_bmove_rec(vc, p, sy + b, sx, dy + b, dx, + height - b, width, y_break); + fbcon_bmove_rec(vc, p, sy, sx, dy, dx, b, width, + y_break); + } + return; + } + ops->bmove(vc, info, real_y(p, sy), sx, real_y(p, dy), dx, + height, width); +} + static void updatescrollmode(struct fbcon_display *p, struct fb_info *info, struct vc_data *vc) @@ -1664,7 +2119,21 @@ static int fbcon_switch(struct vc_data *vc) updatescrollmode(p, info, vc); - scrollback_phys_max = 0; + switch (p->scrollmode) { + case SCROLL_WRAP_MOVE: + scrollback_phys_max = p->vrows - vc->vc_rows; + break; + case SCROLL_PAN_MOVE: + case SCROLL_PAN_REDRAW: + scrollback_phys_max = p->vrows - 2 * vc->vc_rows; + if (scrollback_phys_max < 0) + scrollback_phys_max = 0; + break; + default: + scrollback_phys_max = 0; + break; + } + scrollback_max = 0; scrollback_current = 0; diff --git a/drivers/video/fbdev/core/fbcon.h b/drivers/video/fbdev/core/fbcon.h index a00603b4451a..5246d0f2574b 100644 --- a/drivers/video/fbdev/core/fbcon.h +++ b/drivers/video/fbdev/core/fbcon.h @@ -29,6 +29,7 @@ struct fbcon_display { /* Filled in by the low-level console driver */ const u_char *fontdata; int userfont; /* != 0 if fontdata kmalloc()ed */ + u_short scrollmode; /* Scroll Method */ u_short inverse; /* != 0 text black on white as default */ short yscroll; /* Hardware scrolling */ int vrows; /* number of virtual rows */ @@ -51,6 +52,8 @@ struct fbcon_display { }; struct fbcon_ops { + void (*bmove)(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int dy, int dx, int height, int width); void (*clear)(struct vc_data *vc, struct fb_info *info, int sy, int sx, int height, int width); void (*putcs)(struct vc_data *vc, struct fb_info *info, @@ -149,6 +152,62 @@ static inline int attr_col_ec(int shift, struct vc_data *vc, #define attr_bgcol_ec(bgshift, vc, info) attr_col_ec(bgshift, vc, info, 0) #define attr_fgcol_ec(fgshift, vc, info) attr_col_ec(fgshift, vc, info, 1) + /* + * Scroll Method + */ + +/* There are several methods fbcon can use to move text around the screen: + * + * Operation Pan Wrap + *--------------------------------------------- + * SCROLL_MOVE copyarea No No + * SCROLL_PAN_MOVE copyarea Yes No + * SCROLL_WRAP_MOVE copyarea No Yes + * SCROLL_REDRAW imageblit No No + * SCROLL_PAN_REDRAW imageblit Yes No + * SCROLL_WRAP_REDRAW imageblit No Yes + * + * (SCROLL_WRAP_REDRAW is not implemented yet) + * + * In general, fbcon will choose the best scrolling + * method based on the rule below: + * + * Pan/Wrap > accel imageblit > accel copyarea > + * soft imageblit > (soft copyarea) + * + * Exception to the rule: Pan + accel copyarea is + * preferred over Pan + accel imageblit. + * + * The above is typical for PCI/AGP cards. Unless + * overridden, fbcon will never use soft copyarea. + * + * If you need to override the above rule, set the + * appropriate flags in fb_info->flags. For example, + * to prefer copyarea over imageblit, set + * FBINFO_READS_FAST. + * + * Other notes: + * + use the hardware engine to move the text + * (hw-accelerated copyarea() and fillrect()) + * + use hardware-supported panning on a large virtual screen + * + amifb can not only pan, but also wrap the display by N lines + * (i.e. visible line i = physical line (i+N) % yres). + * + read what's already rendered on the screen and + * write it in a different place (this is cfb_copyarea()) + * + re-render the text to the screen + * + * Whether to use wrapping or panning can only be figured out at + * runtime (when we know whether our font height is a multiple + * of the pan/wrap step) + * + */ + +#define SCROLL_MOVE 0x001 +#define SCROLL_PAN_MOVE 0x002 +#define SCROLL_WRAP_MOVE 0x003 +#define SCROLL_REDRAW 0x004 +#define SCROLL_PAN_REDRAW 0x005 + #ifdef CONFIG_FB_TILEBLITTING extern void fbcon_set_tileops(struct vc_data *vc, struct fb_info *info); #endif diff --git a/drivers/video/fbdev/core/fbcon_ccw.c b/drivers/video/fbdev/core/fbcon_ccw.c index ffa78936eaab..9cd2c4b05c32 100644 --- a/drivers/video/fbdev/core/fbcon_ccw.c +++ b/drivers/video/fbdev/core/fbcon_ccw.c @@ -59,12 +59,31 @@ static void ccw_update_attr(u8 *dst, u8 *src, int attribute, } } + +static void ccw_bmove(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int dy, int dx, int height, int width) +{ + struct fbcon_ops *ops = info->fbcon_par; + struct fb_copyarea area; + u32 vyres = GETVYRES(ops->p->scrollmode, info); + + area.sx = sy * vc->vc_font.height; + area.sy = vyres - ((sx + width) * vc->vc_font.width); + area.dx = dy * vc->vc_font.height; + area.dy = vyres - ((dx + width) * vc->vc_font.width); + area.width = height * vc->vc_font.height; + area.height = width * vc->vc_font.width; + + info->fbops->fb_copyarea(info, &area); +} + static void ccw_clear(struct vc_data *vc, struct fb_info *info, int sy, int sx, int height, int width) { + struct fbcon_ops *ops = info->fbcon_par; struct fb_fillrect region; int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; - u32 vyres = info->var.yres; + u32 vyres = GETVYRES(ops->p->scrollmode, info); region.color = attr_bgcol_ec(bgshift,vc,info); region.dx = sy * vc->vc_font.height; @@ -121,7 +140,7 @@ static void ccw_putcs(struct vc_data *vc, struct fb_info *info, u32 cnt, pitch, size; u32 attribute = get_attribute(info, scr_readw(s)); u8 *dst, *buf = NULL; - u32 vyres = info->var.yres; + u32 vyres = GETVYRES(ops->p->scrollmode, info); if (!ops->fontbuffer) return; @@ -210,7 +229,7 @@ static void ccw_cursor(struct vc_data *vc, struct fb_info *info, int mode, int attribute, use_sw = vc->vc_cursor_type & CUR_SW; int err = 1, dx, dy; char *src; - u32 vyres = info->var.yres; + u32 vyres = GETVYRES(ops->p->scrollmode, info); if (!ops->fontbuffer) return; @@ -368,7 +387,7 @@ static int ccw_update_start(struct fb_info *info) { struct fbcon_ops *ops = info->fbcon_par; u32 yoffset; - u32 vyres = info->var.yres; + u32 vyres = GETVYRES(ops->p->scrollmode, info); int err; yoffset = (vyres - info->var.yres) - ops->var.xoffset; @@ -383,6 +402,7 @@ static int ccw_update_start(struct fb_info *info) void fbcon_rotate_ccw(struct fbcon_ops *ops) { + ops->bmove = ccw_bmove; ops->clear = ccw_clear; ops->putcs = ccw_putcs; ops->clear_margins = ccw_clear_margins; diff --git a/drivers/video/fbdev/core/fbcon_cw.c b/drivers/video/fbdev/core/fbcon_cw.c index 92e5b7fb51ee..88d89fad3f05 100644 --- a/drivers/video/fbdev/core/fbcon_cw.c +++ b/drivers/video/fbdev/core/fbcon_cw.c @@ -44,12 +44,31 @@ static void cw_update_attr(u8 *dst, u8 *src, int attribute, } } + +static void cw_bmove(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int dy, int dx, int height, int width) +{ + struct fbcon_ops *ops = info->fbcon_par; + struct fb_copyarea area; + u32 vxres = GETVXRES(ops->p->scrollmode, info); + + area.sx = vxres - ((sy + height) * vc->vc_font.height); + area.sy = sx * vc->vc_font.width; + area.dx = vxres - ((dy + height) * vc->vc_font.height); + area.dy = dx * vc->vc_font.width; + area.width = height * vc->vc_font.height; + area.height = width * vc->vc_font.width; + + info->fbops->fb_copyarea(info, &area); +} + static void cw_clear(struct vc_data *vc, struct fb_info *info, int sy, int sx, int height, int width) { + struct fbcon_ops *ops = info->fbcon_par; struct fb_fillrect region; int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; - u32 vxres = info->var.xres; + u32 vxres = GETVXRES(ops->p->scrollmode, info); region.color = attr_bgcol_ec(bgshift,vc,info); region.dx = vxres - ((sy + height) * vc->vc_font.height); @@ -106,7 +125,7 @@ static void cw_putcs(struct vc_data *vc, struct fb_info *info, u32 cnt, pitch, size; u32 attribute = get_attribute(info, scr_readw(s)); u8 *dst, *buf = NULL; - u32 vxres = info->var.xres; + u32 vxres = GETVXRES(ops->p->scrollmode, info); if (!ops->fontbuffer) return; @@ -193,7 +212,7 @@ static void cw_cursor(struct vc_data *vc, struct fb_info *info, int mode, int attribute, use_sw = vc->vc_cursor_type & CUR_SW; int err = 1, dx, dy; char *src; - u32 vxres = info->var.xres; + u32 vxres = GETVXRES(ops->p->scrollmode, info); if (!ops->fontbuffer) return; @@ -350,7 +369,7 @@ static void cw_cursor(struct vc_data *vc, struct fb_info *info, int mode, static int cw_update_start(struct fb_info *info) { struct fbcon_ops *ops = info->fbcon_par; - u32 vxres = info->var.xres; + u32 vxres = GETVXRES(ops->p->scrollmode, info); u32 xoffset; int err; @@ -366,6 +385,7 @@ static int cw_update_start(struct fb_info *info) void fbcon_rotate_cw(struct fbcon_ops *ops) { + ops->bmove = cw_bmove; ops->clear = cw_clear; ops->putcs = cw_putcs; ops->clear_margins = cw_clear_margins; diff --git a/drivers/video/fbdev/core/fbcon_rotate.h b/drivers/video/fbdev/core/fbcon_rotate.h index b528b2e54283..e233444cda66 100644 --- a/drivers/video/fbdev/core/fbcon_rotate.h +++ b/drivers/video/fbdev/core/fbcon_rotate.h @@ -11,6 +11,15 @@ #ifndef _FBCON_ROTATE_H #define _FBCON_ROTATE_H +#define GETVYRES(s,i) ({ \ + (s == SCROLL_REDRAW || s == SCROLL_MOVE) ? \ + (i)->var.yres : (i)->var.yres_virtual; }) + +#define GETVXRES(s,i) ({ \ + (s == SCROLL_REDRAW || s == SCROLL_MOVE || !(i)->fix.xpanstep) ? \ + (i)->var.xres : (i)->var.xres_virtual; }) + + static inline int pattern_test_bit(u32 x, u32 y, u32 pitch, const char *pat) { u32 tmp = (y * pitch) + x, index = tmp / 8, bit = tmp % 8; diff --git a/drivers/video/fbdev/core/fbcon_ud.c b/drivers/video/fbdev/core/fbcon_ud.c index 09619bd8e021..8d5e66b1bdfb 100644 --- a/drivers/video/fbdev/core/fbcon_ud.c +++ b/drivers/video/fbdev/core/fbcon_ud.c @@ -44,13 +44,33 @@ static void ud_update_attr(u8 *dst, u8 *src, int attribute, } } + +static void ud_bmove(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int dy, int dx, int height, int width) +{ + struct fbcon_ops *ops = info->fbcon_par; + struct fb_copyarea area; + u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p->scrollmode, info); + + area.sy = vyres - ((sy + height) * vc->vc_font.height); + area.sx = vxres - ((sx + width) * vc->vc_font.width); + area.dy = vyres - ((dy + height) * vc->vc_font.height); + area.dx = vxres - ((dx + width) * vc->vc_font.width); + area.height = height * vc->vc_font.height; + area.width = width * vc->vc_font.width; + + info->fbops->fb_copyarea(info, &area); +} + static void ud_clear(struct vc_data *vc, struct fb_info *info, int sy, int sx, int height, int width) { + struct fbcon_ops *ops = info->fbcon_par; struct fb_fillrect region; int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; - u32 vyres = info->var.yres; - u32 vxres = info->var.xres; + u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p->scrollmode, info); region.color = attr_bgcol_ec(bgshift,vc,info); region.dy = vyres - ((sy + height) * vc->vc_font.height); @@ -142,8 +162,8 @@ static void ud_putcs(struct vc_data *vc, struct fb_info *info, u32 mod = vc->vc_font.width % 8, cnt, pitch, size; u32 attribute = get_attribute(info, scr_readw(s)); u8 *dst, *buf = NULL; - u32 vyres = info->var.yres; - u32 vxres = info->var.xres; + u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p->scrollmode, info); if (!ops->fontbuffer) return; @@ -239,8 +259,8 @@ static void ud_cursor(struct vc_data *vc, struct fb_info *info, int mode, int attribute, use_sw = vc->vc_cursor_type & CUR_SW; int err = 1, dx, dy; char *src; - u32 vyres = info->var.yres; - u32 vxres = info->var.xres; + u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p->scrollmode, info); if (!ops->fontbuffer) return; @@ -390,8 +410,8 @@ static int ud_update_start(struct fb_info *info) { struct fbcon_ops *ops = info->fbcon_par; int xoffset, yoffset; - u32 vyres = info->var.yres; - u32 vxres = info->var.xres; + u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p->scrollmode, info); int err; xoffset = vxres - info->var.xres - ops->var.xoffset; @@ -409,6 +429,7 @@ static int ud_update_start(struct fb_info *info) void fbcon_rotate_ud(struct fbcon_ops *ops) { + ops->bmove = ud_bmove; ops->clear = ud_clear; ops->putcs = ud_putcs; ops->clear_margins = ud_clear_margins; diff --git a/drivers/video/fbdev/core/tileblit.c b/drivers/video/fbdev/core/tileblit.c index 72af95053bcb..2768eff247ba 100644 --- a/drivers/video/fbdev/core/tileblit.c +++ b/drivers/video/fbdev/core/tileblit.c @@ -16,6 +16,21 @@ #include #include "fbcon.h" +static void tile_bmove(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int dy, int dx, int height, int width) +{ + struct fb_tilearea area; + + area.sx = sx; + area.sy = sy; + area.dx = dx; + area.dy = dy; + area.height = height; + area.width = width; + + info->tileops->fb_tilecopy(info, &area); +} + static void tile_clear(struct vc_data *vc, struct fb_info *info, int sy, int sx, int height, int width) { @@ -118,6 +133,7 @@ void fbcon_set_tileops(struct vc_data *vc, struct fb_info *info) struct fb_tilemap map; struct fbcon_ops *ops = info->fbcon_par; + ops->bmove = tile_bmove; ops->clear = tile_clear; ops->putcs = tile_putcs; ops->clear_margins = tile_clear_margins; diff --git a/drivers/video/fbdev/skeletonfb.c b/drivers/video/fbdev/skeletonfb.c index 0fe922f726e9..bcacfb6934fa 100644 --- a/drivers/video/fbdev/skeletonfb.c +++ b/drivers/video/fbdev/skeletonfb.c @@ -505,15 +505,15 @@ void xxxfb_fillrect(struct fb_info *p, const struct fb_fillrect *region) } /** - * xxxfb_copyarea - OBSOLETE function. + * xxxfb_copyarea - REQUIRED function. Can use generic routines if + * non acclerated hardware and packed pixel based. * Copies one area of the screen to another area. - * Will be deleted in a future version * * @info: frame buffer structure that represents a single frame buffer * @area: Structure providing the data to copy the framebuffer contents * from one region to another. * - * This drawing operation copied a rectangular area from one area of the + * This drawing operation copies a rectangular area from one area of the * screen to another area. */ void xxxfb_copyarea(struct fb_info *p, const struct fb_copyarea *area) @@ -645,9 +645,9 @@ static const struct fb_ops xxxfb_ops = { .fb_setcolreg = xxxfb_setcolreg, .fb_blank = xxxfb_blank, .fb_pan_display = xxxfb_pan_display, - .fb_fillrect = xxxfb_fillrect, /* Needed !!! */ - .fb_copyarea = xxxfb_copyarea, /* Obsolete */ - .fb_imageblit = xxxfb_imageblit, /* Needed !!! */ + .fb_fillrect = xxxfb_fillrect, /* Needed !!! */ + .fb_copyarea = xxxfb_copyarea, /* Needed !!! */ + .fb_imageblit = xxxfb_imageblit, /* Needed !!! */ .fb_cursor = xxxfb_cursor, /* Optional !!! */ .fb_sync = xxxfb_sync, .fb_ioctl = xxxfb_ioctl, diff --git a/include/linux/fb.h b/include/linux/fb.h index 3da95842b207..02f362c661c8 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -262,7 +262,7 @@ struct fb_ops { /* Draws a rectangle */ void (*fb_fillrect) (struct fb_info *info, const struct fb_fillrect *rect); - /* Copy data from area to another. Obsolete. */ + /* Copy data from area to another */ void (*fb_copyarea) (struct fb_info *info, const struct fb_copyarea *region); /* Draws a image to the display */ void (*fb_imageblit) (struct fb_info *info, const struct fb_image *image); -- cgit v1.2.3 From c86d86131ab75696fc52d98571148842e067d620 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Wed, 2 Feb 2022 06:09:04 +0300 Subject: Partially revert "net/smc: Add netlink net namespace support" The change of sizeof(struct smc_diag_linkinfo) by commit 79d39fc503b4 ("net/smc: Add netlink net namespace support") introduced an ABI regression: since struct smc_diag_lgrinfo contains an object of type "struct smc_diag_linkinfo", offset of all subsequent members of struct smc_diag_lgrinfo was changed by that change. As result, applications compiled with the old version of struct smc_diag_linkinfo will receive garbage in struct smc_diag_lgrinfo.role if the kernel implements this new version of struct smc_diag_linkinfo. Fix this regression by reverting the part of commit 79d39fc503b4 that changes struct smc_diag_linkinfo. After all, there is SMC_GEN_NETLINK interface which is good enough, so there is probably no need to touch the smc_diag ABI in the first place. Fixes: 79d39fc503b4 ("net/smc: Add netlink net namespace support") Signed-off-by: Dmitry V. Levin Reviewed-by: Karsten Graul Link: https://lore.kernel.org/r/20220202030904.GA9742@altlinux.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/smc_diag.h | 11 +++++------ net/smc/smc_diag.c | 2 -- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/smc_diag.h b/include/uapi/linux/smc_diag.h index c7008d87f1a4..8cb3a6fef553 100644 --- a/include/uapi/linux/smc_diag.h +++ b/include/uapi/linux/smc_diag.h @@ -84,12 +84,11 @@ struct smc_diag_conninfo { /* SMC_DIAG_LINKINFO */ struct smc_diag_linkinfo { - __u8 link_id; /* link identifier */ - __u8 ibname[IB_DEVICE_NAME_MAX]; /* name of the RDMA device */ - __u8 ibport; /* RDMA device port number */ - __u8 gid[40]; /* local GID */ - __u8 peer_gid[40]; /* peer GID */ - __aligned_u64 net_cookie; /* RDMA device net namespace */ + __u8 link_id; /* link identifier */ + __u8 ibname[IB_DEVICE_NAME_MAX]; /* name of the RDMA device */ + __u8 ibport; /* RDMA device port number */ + __u8 gid[40]; /* local GID */ + __u8 peer_gid[40]; /* peer GID */ }; struct smc_diag_lgrinfo { diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index b8898c787d23..1fca2f90a9c7 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -146,13 +146,11 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, (req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && !list_empty(&smc->conn.lgr->list)) { struct smc_link *link = smc->conn.lnk; - struct net *net = read_pnet(&link->smcibdev->ibdev->coredev.rdma_net); struct smc_diag_lgrinfo linfo = { .role = smc->conn.lgr->role, .lnk[0].ibport = link->ibport, .lnk[0].link_id = link->link_id, - .lnk[0].net_cookie = net->net_cookie, }; memcpy(linfo.lnk[0].ibname, -- cgit v1.2.3 From e1d2699b96793d19388e302fa095e0da2c145701 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Jan 2022 22:10:52 -0500 Subject: NFS: Avoid duplicate uncached readdir calls on eof If we've reached the end of the directory, then cache that information in the context so that we don't need to do an uncached readdir in order to rediscover that fact. Fixes: 794092c57f89 ("NFS: Do uncached readdir when we're seeking a cookie in an empty page cache") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 20 +++++++++++++++----- include/linux/nfs_fs.h | 1 + 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index a3de586d21e2..7bc7cf6b26f0 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -80,6 +80,7 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir ctx->dir_cookie = 0; ctx->dup_cookie = 0; ctx->page_index = 0; + ctx->eof = false; spin_lock(&dir->i_lock); if (list_empty(&nfsi->open_files) && (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)) @@ -168,6 +169,7 @@ struct nfs_readdir_descriptor { unsigned int cache_entry_index; signed char duped; bool plus; + bool eob; bool eof; }; @@ -989,7 +991,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, ent = &array->array[i]; if (!dir_emit(desc->ctx, ent->name, ent->name_len, nfs_compat_user_ino64(ent->ino), ent->d_type)) { - desc->eof = true; + desc->eob = true; break; } memcpy(desc->verf, verf, sizeof(desc->verf)); @@ -1005,7 +1007,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, desc->duped = 1; } if (array->page_is_eof) - desc->eof = true; + desc->eof = !desc->eob; kunmap(desc->page); dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %llu\n", @@ -1048,7 +1050,7 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc) status = nfs_readdir_xdr_to_array(desc, desc->verf, verf, arrays, sz); - for (i = 0; !desc->eof && i < sz && arrays[i]; i++) { + for (i = 0; !desc->eob && i < sz && arrays[i]; i++) { desc->page = arrays[i]; nfs_do_filldir(desc, verf); } @@ -1107,9 +1109,15 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) desc->duped = dir_ctx->duped; page_index = dir_ctx->page_index; desc->attr_gencount = dir_ctx->attr_gencount; + desc->eof = dir_ctx->eof; memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf)); spin_unlock(&file->f_lock); + if (desc->eof) { + res = 0; + goto out_free; + } + if (test_and_clear_bit(NFS_INO_FORCE_READDIR, &nfsi->flags) && list_is_singular(&nfsi->open_files)) invalidate_mapping_pages(inode->i_mapping, page_index + 1, -1); @@ -1143,7 +1151,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) nfs_do_filldir(desc, nfsi->cookieverf); nfs_readdir_page_unlock_and_put_cached(desc); - } while (!desc->eof); + } while (!desc->eob && !desc->eof); spin_lock(&file->f_lock); dir_ctx->dir_cookie = desc->dir_cookie; @@ -1151,9 +1159,10 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) dir_ctx->duped = desc->duped; dir_ctx->attr_gencount = desc->attr_gencount; dir_ctx->page_index = desc->page_index; + dir_ctx->eof = desc->eof; memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf)); spin_unlock(&file->f_lock); - +out_free: kfree(desc); out: @@ -1195,6 +1204,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) if (offset == 0) memset(dir_ctx->verf, 0, sizeof(dir_ctx->verf)); dir_ctx->duped = 0; + dir_ctx->eof = false; } spin_unlock(&filp->f_lock); return offset; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 02aa49323d1d..68f81d8d36de 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -107,6 +107,7 @@ struct nfs_open_dir_context { __u64 dup_cookie; pgoff_t page_index; signed char duped; + bool eof; }; /* -- cgit v1.2.3 From 2ea88716369ac9a7486a8cb309d6bf1239ea156c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 23 Jan 2022 17:27:47 +0100 Subject: libceph: make recv path in secure mode work the same as send path The recv path of secure mode is intertwined with that of crc mode. While it's slightly more efficient that way (the ciphertext is read into the destination buffer and decrypted in place, thus avoiding two potentially heavy memory allocations for the bounce buffer and the corresponding sg array), it isn't really amenable to changes. Sacrifice that edge and align with the send path which always uses a full-sized bounce buffer (currently there is no other way -- if the kernel crypto API ever grows support for streaming (piecewise) en/decryption for GCM [1], we would be able to easily take advantage of that on both sides). [1] https://lore.kernel.org/all/20141225202830.GA18794@gondor.apana.org.au/ Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- include/linux/ceph/messenger.h | 4 + net/ceph/messenger_v2.c | 216 +++++++++++++++++++++++++++++------------ 2 files changed, 158 insertions(+), 62 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index ff99ce094cfa..6c6b6ea52bb8 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -383,6 +383,10 @@ struct ceph_connection_v2_info { struct ceph_gcm_nonce in_gcm_nonce; struct ceph_gcm_nonce out_gcm_nonce; + struct page **in_enc_pages; + int in_enc_page_cnt; + int in_enc_resid; + int in_enc_i; struct page **out_enc_pages; int out_enc_page_cnt; int out_enc_resid; diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index c4099b641b38..2ea00489e691 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -57,8 +57,9 @@ #define IN_S_HANDLE_CONTROL_REMAINDER 3 #define IN_S_PREPARE_READ_DATA 4 #define IN_S_PREPARE_READ_DATA_CONT 5 -#define IN_S_HANDLE_EPILOGUE 6 -#define IN_S_FINISH_SKIP 7 +#define IN_S_PREPARE_READ_ENC_PAGE 6 +#define IN_S_HANDLE_EPILOGUE 7 +#define IN_S_FINISH_SKIP 8 #define OUT_S_QUEUE_DATA 1 #define OUT_S_QUEUE_DATA_CONT 2 @@ -1032,22 +1033,41 @@ static int decrypt_control_remainder(struct ceph_connection *con) padded_len(rem_len) + CEPH_GCM_TAG_LEN); } -static int decrypt_message(struct ceph_connection *con) +static int decrypt_tail(struct ceph_connection *con) { + struct sg_table enc_sgt = {}; struct sg_table sgt = {}; + int tail_len; int ret; + tail_len = tail_onwire_len(con->in_msg, true); + ret = sg_alloc_table_from_pages(&enc_sgt, con->v2.in_enc_pages, + con->v2.in_enc_page_cnt, 0, tail_len, + GFP_NOIO); + if (ret) + goto out; + ret = setup_message_sgs(&sgt, con->in_msg, FRONT_PAD(con->v2.in_buf), MIDDLE_PAD(con->v2.in_buf), DATA_PAD(con->v2.in_buf), con->v2.in_buf, true); if (ret) goto out; - ret = gcm_crypt(con, false, sgt.sgl, sgt.sgl, - tail_onwire_len(con->in_msg, true)); + dout("%s con %p msg %p enc_page_cnt %d sg_cnt %d\n", __func__, con, + con->in_msg, con->v2.in_enc_page_cnt, sgt.orig_nents); + ret = gcm_crypt(con, false, enc_sgt.sgl, sgt.sgl, tail_len); + if (ret) + goto out; + + WARN_ON(!con->v2.in_enc_page_cnt); + ceph_release_page_vector(con->v2.in_enc_pages, + con->v2.in_enc_page_cnt); + con->v2.in_enc_pages = NULL; + con->v2.in_enc_page_cnt = 0; out: sg_free_table(&sgt); + sg_free_table(&enc_sgt); return ret; } @@ -1737,8 +1757,7 @@ static void prepare_read_data(struct ceph_connection *con) { struct bio_vec bv; - if (!con_secure(con)) - con->in_data_crc = -1; + con->in_data_crc = -1; ceph_msg_data_cursor_init(&con->v2.in_cursor, con->in_msg, data_len(con->in_msg)); @@ -1751,11 +1770,10 @@ static void prepare_read_data_cont(struct ceph_connection *con) { struct bio_vec bv; - if (!con_secure(con)) - con->in_data_crc = ceph_crc32c_page(con->in_data_crc, - con->v2.in_bvec.bv_page, - con->v2.in_bvec.bv_offset, - con->v2.in_bvec.bv_len); + con->in_data_crc = ceph_crc32c_page(con->in_data_crc, + con->v2.in_bvec.bv_page, + con->v2.in_bvec.bv_offset, + con->v2.in_bvec.bv_len); ceph_msg_data_advance(&con->v2.in_cursor, con->v2.in_bvec.bv_len); if (con->v2.in_cursor.total_resid) { @@ -1766,21 +1784,94 @@ static void prepare_read_data_cont(struct ceph_connection *con) } /* - * We've read all data. Prepare to read data padding (if any) - * and epilogue. + * We've read all data. Prepare to read epilogue. */ reset_in_kvecs(con); - if (con_secure(con)) { - if (need_padding(data_len(con->in_msg))) - add_in_kvec(con, DATA_PAD(con->v2.in_buf), - padding_len(data_len(con->in_msg))); - add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_SECURE_LEN); + add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN); + con->v2.in_state = IN_S_HANDLE_EPILOGUE; +} + +static void prepare_read_tail_plain(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->in_msg; + + if (!front_len(msg) && !middle_len(msg)) { + WARN_ON(!data_len(msg)); + prepare_read_data(con); + return; + } + + reset_in_kvecs(con); + if (front_len(msg)) { + add_in_kvec(con, msg->front.iov_base, front_len(msg)); + WARN_ON(msg->front.iov_len != front_len(msg)); + } + if (middle_len(msg)) { + add_in_kvec(con, msg->middle->vec.iov_base, middle_len(msg)); + WARN_ON(msg->middle->vec.iov_len != middle_len(msg)); + } + + if (data_len(msg)) { + con->v2.in_state = IN_S_PREPARE_READ_DATA; } else { add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN); + con->v2.in_state = IN_S_HANDLE_EPILOGUE; + } +} + +static void prepare_read_enc_page(struct ceph_connection *con) +{ + struct bio_vec bv; + + dout("%s con %p i %d resid %d\n", __func__, con, con->v2.in_enc_i, + con->v2.in_enc_resid); + WARN_ON(!con->v2.in_enc_resid); + + bv.bv_page = con->v2.in_enc_pages[con->v2.in_enc_i]; + bv.bv_offset = 0; + bv.bv_len = min(con->v2.in_enc_resid, (int)PAGE_SIZE); + + set_in_bvec(con, &bv); + con->v2.in_enc_i++; + con->v2.in_enc_resid -= bv.bv_len; + + if (con->v2.in_enc_resid) { + con->v2.in_state = IN_S_PREPARE_READ_ENC_PAGE; + return; } + + /* + * We are set to read the last piece of ciphertext (ending + * with epilogue) + auth tag. + */ + WARN_ON(con->v2.in_enc_i != con->v2.in_enc_page_cnt); con->v2.in_state = IN_S_HANDLE_EPILOGUE; } +static int prepare_read_tail_secure(struct ceph_connection *con) +{ + struct page **enc_pages; + int enc_page_cnt; + int tail_len; + + tail_len = tail_onwire_len(con->in_msg, true); + WARN_ON(!tail_len); + + enc_page_cnt = calc_pages_for(0, tail_len); + enc_pages = ceph_alloc_page_vector(enc_page_cnt, GFP_NOIO); + if (IS_ERR(enc_pages)) + return PTR_ERR(enc_pages); + + WARN_ON(con->v2.in_enc_pages || con->v2.in_enc_page_cnt); + con->v2.in_enc_pages = enc_pages; + con->v2.in_enc_page_cnt = enc_page_cnt; + con->v2.in_enc_resid = tail_len; + con->v2.in_enc_i = 0; + + prepare_read_enc_page(con); + return 0; +} + static void __finish_skip(struct ceph_connection *con) { con->in_seq++; @@ -2589,46 +2680,26 @@ static int __handle_control(struct ceph_connection *con, void *p) } msg = con->in_msg; /* set in process_message_header() */ - if (!front_len(msg) && !middle_len(msg)) { - if (!data_len(msg)) - return process_message(con); - - prepare_read_data(con); - return 0; - } - - reset_in_kvecs(con); if (front_len(msg)) { WARN_ON(front_len(msg) > msg->front_alloc_len); - add_in_kvec(con, msg->front.iov_base, front_len(msg)); msg->front.iov_len = front_len(msg); - - if (con_secure(con) && need_padding(front_len(msg))) - add_in_kvec(con, FRONT_PAD(con->v2.in_buf), - padding_len(front_len(msg))); } else { msg->front.iov_len = 0; } if (middle_len(msg)) { WARN_ON(middle_len(msg) > msg->middle->alloc_len); - add_in_kvec(con, msg->middle->vec.iov_base, middle_len(msg)); msg->middle->vec.iov_len = middle_len(msg); - - if (con_secure(con) && need_padding(middle_len(msg))) - add_in_kvec(con, MIDDLE_PAD(con->v2.in_buf), - padding_len(middle_len(msg))); } else if (msg->middle) { msg->middle->vec.iov_len = 0; } - if (data_len(msg)) { - con->v2.in_state = IN_S_PREPARE_READ_DATA; - } else { - add_in_kvec(con, con->v2.in_buf, - con_secure(con) ? CEPH_EPILOGUE_SECURE_LEN : - CEPH_EPILOGUE_PLAIN_LEN); - con->v2.in_state = IN_S_HANDLE_EPILOGUE; - } + if (!front_len(msg) && !middle_len(msg) && !data_len(msg)) + return process_message(con); + + if (con_secure(con)) + return prepare_read_tail_secure(con); + + prepare_read_tail_plain(con); return 0; } @@ -2717,7 +2788,7 @@ static int handle_epilogue(struct ceph_connection *con) int ret; if (con_secure(con)) { - ret = decrypt_message(con); + ret = decrypt_tail(con); if (ret) { if (ret == -EBADMSG) con->error_msg = "integrity error, bad epilogue auth tag"; @@ -2792,6 +2863,10 @@ static int populate_in_iter(struct ceph_connection *con) prepare_read_data_cont(con); ret = 0; break; + case IN_S_PREPARE_READ_ENC_PAGE: + prepare_read_enc_page(con); + ret = 0; + break; case IN_S_HANDLE_EPILOGUE: ret = handle_epilogue(con); break; @@ -3326,20 +3401,16 @@ void ceph_con_v2_revoke(struct ceph_connection *con) static void revoke_at_prepare_read_data(struct ceph_connection *con) { - int remaining; /* data + [data padding] + epilogue */ + int remaining; int resid; + WARN_ON(con_secure(con)); WARN_ON(!data_len(con->in_msg)); WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter)); resid = iov_iter_count(&con->v2.in_iter); WARN_ON(!resid); - if (con_secure(con)) - remaining = padded_len(data_len(con->in_msg)) + - CEPH_EPILOGUE_SECURE_LEN; - else - remaining = data_len(con->in_msg) + CEPH_EPILOGUE_PLAIN_LEN; - + remaining = data_len(con->in_msg) + CEPH_EPILOGUE_PLAIN_LEN; dout("%s con %p resid %d remaining %d\n", __func__, con, resid, remaining); con->v2.in_iter.count -= resid; @@ -3350,8 +3421,9 @@ static void revoke_at_prepare_read_data(struct ceph_connection *con) static void revoke_at_prepare_read_data_cont(struct ceph_connection *con) { int recved, resid; /* current piece of data */ - int remaining; /* [data padding] + epilogue */ + int remaining; + WARN_ON(con_secure(con)); WARN_ON(!data_len(con->in_msg)); WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter)); resid = iov_iter_count(&con->v2.in_iter); @@ -3363,12 +3435,7 @@ static void revoke_at_prepare_read_data_cont(struct ceph_connection *con) ceph_msg_data_advance(&con->v2.in_cursor, recved); WARN_ON(resid > con->v2.in_cursor.total_resid); - if (con_secure(con)) - remaining = padding_len(data_len(con->in_msg)) + - CEPH_EPILOGUE_SECURE_LEN; - else - remaining = CEPH_EPILOGUE_PLAIN_LEN; - + remaining = CEPH_EPILOGUE_PLAIN_LEN; dout("%s con %p total_resid %zu remaining %d\n", __func__, con, con->v2.in_cursor.total_resid, remaining); con->v2.in_iter.count -= resid; @@ -3376,11 +3443,26 @@ static void revoke_at_prepare_read_data_cont(struct ceph_connection *con) con->v2.in_state = IN_S_FINISH_SKIP; } +static void revoke_at_prepare_read_enc_page(struct ceph_connection *con) +{ + int resid; /* current enc page (not necessarily data) */ + + WARN_ON(!con_secure(con)); + WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter)); + resid = iov_iter_count(&con->v2.in_iter); + WARN_ON(!resid || resid > con->v2.in_bvec.bv_len); + + dout("%s con %p resid %d enc_resid %d\n", __func__, con, resid, + con->v2.in_enc_resid); + con->v2.in_iter.count -= resid; + set_in_skip(con, resid + con->v2.in_enc_resid); + con->v2.in_state = IN_S_FINISH_SKIP; +} + static void revoke_at_handle_epilogue(struct ceph_connection *con) { int resid; - WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter)); resid = iov_iter_count(&con->v2.in_iter); WARN_ON(!resid); @@ -3399,6 +3481,9 @@ void ceph_con_v2_revoke_incoming(struct ceph_connection *con) case IN_S_PREPARE_READ_DATA_CONT: revoke_at_prepare_read_data_cont(con); break; + case IN_S_PREPARE_READ_ENC_PAGE: + revoke_at_prepare_read_enc_page(con); + break; case IN_S_HANDLE_EPILOGUE: revoke_at_handle_epilogue(con); break; @@ -3432,6 +3517,13 @@ void ceph_con_v2_reset_protocol(struct ceph_connection *con) clear_out_sign_kvecs(con); free_conn_bufs(con); + if (con->v2.in_enc_pages) { + WARN_ON(!con->v2.in_enc_page_cnt); + ceph_release_page_vector(con->v2.in_enc_pages, + con->v2.in_enc_page_cnt); + con->v2.in_enc_pages = NULL; + con->v2.in_enc_page_cnt = 0; + } if (con->v2.out_enc_pages) { WARN_ON(!con->v2.out_enc_page_cnt); ceph_release_page_vector(con->v2.out_enc_pages, -- cgit v1.2.3 From 038b8d1d1ab1cce11a158d30bf080ff41a2cfd15 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 30 Dec 2021 15:13:32 +0100 Subject: libceph: optionally use bounce buffer on recv path in crc mode Both msgr1 and msgr2 in crc mode are zero copy in the sense that message data is read from the socket directly into the destination buffer. We assume that the destination buffer is stable (i.e. remains unchanged while it is being read to) though. Otherwise, CRC errors ensue: libceph: read_partial_message 0000000048edf8ad data crc 1063286393 != exp. 228122706 libceph: osd1 (1)192.168.122.1:6843 bad crc/signature libceph: bad data crc, calculated 57958023, expected 1805382778 libceph: osd2 (2)192.168.122.1:6876 integrity error, bad crc Introduce rxbounce option to enable use of a bounce buffer when receiving message data. In particular this is needed if a mapped image is a Windows VM disk, passed to QEMU. Windows has a system-wide "dummy" page that may be mapped into the destination buffer (potentially more than once into the same buffer) by the Windows Memory Manager in an effort to generate a single large I/O [1][2]. QEMU makes a point of preserving overlap relationships when cloning I/O vectors, so krbd gets exposed to this behaviour. [1] "What Is Really in That MDL?" https://docs.microsoft.com/en-us/previous-versions/windows/hardware/design/dn614012(v=vs.85) [2] https://blogs.msmvps.com/kernelmustard/2005/05/04/dummy-pages/ URL: https://bugzilla.redhat.com/show_bug.cgi?id=1973317 Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- include/linux/ceph/libceph.h | 1 + include/linux/ceph/messenger.h | 1 + net/ceph/ceph_common.c | 7 +++++ net/ceph/messenger.c | 4 +++ net/ceph/messenger_v1.c | 54 ++++++++++++++++++++++++++++++++++----- net/ceph/messenger_v2.c | 58 ++++++++++++++++++++++++++++++++---------- 6 files changed, 105 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 6a89ea410e43..edf62eaa6285 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -35,6 +35,7 @@ #define CEPH_OPT_TCP_NODELAY (1<<4) /* TCP_NODELAY on TCP sockets */ #define CEPH_OPT_NOMSGSIGN (1<<5) /* don't sign msgs (msgr1) */ #define CEPH_OPT_ABORT_ON_FULL (1<<6) /* abort w/ ENOSPC when full */ +#define CEPH_OPT_RXBOUNCE (1<<7) /* double-buffer read data */ #define CEPH_OPT_DEFAULT (CEPH_OPT_TCP_NODELAY) diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 6c6b6ea52bb8..e7f2fb2fc207 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -461,6 +461,7 @@ struct ceph_connection { struct ceph_msg *out_msg; /* sending message (== tail of out_sent) */ + struct page *bounce_page; u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */ struct timespec64 last_keepalive_ack; /* keepalive2 ack stamp */ diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index ecc400a0b7bb..4c6441536d55 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -246,6 +246,7 @@ enum { Opt_cephx_sign_messages, Opt_tcp_nodelay, Opt_abort_on_full, + Opt_rxbounce, }; enum { @@ -295,6 +296,7 @@ static const struct fs_parameter_spec ceph_parameters[] = { fsparam_u32 ("osdkeepalive", Opt_osdkeepalivetimeout), fsparam_enum ("read_from_replica", Opt_read_from_replica, ceph_param_read_from_replica), + fsparam_flag ("rxbounce", Opt_rxbounce), fsparam_enum ("ms_mode", Opt_ms_mode, ceph_param_ms_mode), fsparam_string ("secret", Opt_secret), @@ -584,6 +586,9 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, case Opt_abort_on_full: opt->flags |= CEPH_OPT_ABORT_ON_FULL; break; + case Opt_rxbounce: + opt->flags |= CEPH_OPT_RXBOUNCE; + break; default: BUG(); @@ -660,6 +665,8 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, seq_puts(m, "notcp_nodelay,"); if (show_all && (opt->flags & CEPH_OPT_ABORT_ON_FULL)) seq_puts(m, "abort_on_full,"); + if (opt->flags & CEPH_OPT_RXBOUNCE) + seq_puts(m, "rxbounce,"); if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) seq_printf(m, "mount_timeout=%d,", diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 45eba2dcb67a..d3bb656308b4 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -515,6 +515,10 @@ static void ceph_con_reset_protocol(struct ceph_connection *con) ceph_msg_put(con->out_msg); con->out_msg = NULL; } + if (con->bounce_page) { + __free_page(con->bounce_page); + con->bounce_page = NULL; + } if (ceph_msgr2(from_msgr(con->msgr))) ceph_con_v2_reset_protocol(con); diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c index 2cb5ffdf071a..6b014eca3a13 100644 --- a/net/ceph/messenger_v1.c +++ b/net/ceph/messenger_v1.c @@ -992,8 +992,7 @@ static int read_partial_message_section(struct ceph_connection *con, static int read_partial_msg_data(struct ceph_connection *con) { - struct ceph_msg *msg = con->in_msg; - struct ceph_msg_data_cursor *cursor = &msg->cursor; + struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor; bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); struct page *page; size_t page_offset; @@ -1001,9 +1000,6 @@ static int read_partial_msg_data(struct ceph_connection *con) u32 crc = 0; int ret; - if (!msg->num_data_items) - return -EIO; - if (do_datacrc) crc = con->in_data_crc; while (cursor->total_resid) { @@ -1031,6 +1027,46 @@ static int read_partial_msg_data(struct ceph_connection *con) return 1; /* must return > 0 to indicate success */ } +static int read_partial_msg_data_bounce(struct ceph_connection *con) +{ + struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor; + struct page *page; + size_t off, len; + u32 crc; + int ret; + + if (unlikely(!con->bounce_page)) { + con->bounce_page = alloc_page(GFP_NOIO); + if (!con->bounce_page) { + pr_err("failed to allocate bounce page\n"); + return -ENOMEM; + } + } + + crc = con->in_data_crc; + while (cursor->total_resid) { + if (!cursor->resid) { + ceph_msg_data_advance(cursor, 0); + continue; + } + + page = ceph_msg_data_next(cursor, &off, &len, NULL); + ret = ceph_tcp_recvpage(con->sock, con->bounce_page, 0, len); + if (ret <= 0) { + con->in_data_crc = crc; + return ret; + } + + crc = crc32c(crc, page_address(con->bounce_page), ret); + memcpy_to_page(page, off, page_address(con->bounce_page), ret); + + ceph_msg_data_advance(cursor, ret); + } + con->in_data_crc = crc; + + return 1; /* must return > 0 to indicate success */ +} + /* * read (part of) a message. */ @@ -1141,7 +1177,13 @@ static int read_partial_message(struct ceph_connection *con) /* (page) data */ if (data_len) { - ret = read_partial_msg_data(con); + if (!m->num_data_items) + return -EIO; + + if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) + ret = read_partial_msg_data_bounce(con); + else + ret = read_partial_msg_data(con); if (ret <= 0) return ret; } diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index 2ea00489e691..c81379f93ad5 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -1753,7 +1753,7 @@ static int prepare_read_control_remainder(struct ceph_connection *con) return 0; } -static void prepare_read_data(struct ceph_connection *con) +static int prepare_read_data(struct ceph_connection *con) { struct bio_vec bv; @@ -1762,23 +1762,55 @@ static void prepare_read_data(struct ceph_connection *con) data_len(con->in_msg)); get_bvec_at(&con->v2.in_cursor, &bv); - set_in_bvec(con, &bv); + if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) { + if (unlikely(!con->bounce_page)) { + con->bounce_page = alloc_page(GFP_NOIO); + if (!con->bounce_page) { + pr_err("failed to allocate bounce page\n"); + return -ENOMEM; + } + } + + bv.bv_page = con->bounce_page; + bv.bv_offset = 0; + set_in_bvec(con, &bv); + } else { + set_in_bvec(con, &bv); + } con->v2.in_state = IN_S_PREPARE_READ_DATA_CONT; + return 0; } static void prepare_read_data_cont(struct ceph_connection *con) { struct bio_vec bv; - con->in_data_crc = ceph_crc32c_page(con->in_data_crc, - con->v2.in_bvec.bv_page, - con->v2.in_bvec.bv_offset, - con->v2.in_bvec.bv_len); + if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) { + con->in_data_crc = crc32c(con->in_data_crc, + page_address(con->bounce_page), + con->v2.in_bvec.bv_len); + + get_bvec_at(&con->v2.in_cursor, &bv); + memcpy_to_page(bv.bv_page, bv.bv_offset, + page_address(con->bounce_page), + con->v2.in_bvec.bv_len); + } else { + con->in_data_crc = ceph_crc32c_page(con->in_data_crc, + con->v2.in_bvec.bv_page, + con->v2.in_bvec.bv_offset, + con->v2.in_bvec.bv_len); + } ceph_msg_data_advance(&con->v2.in_cursor, con->v2.in_bvec.bv_len); if (con->v2.in_cursor.total_resid) { get_bvec_at(&con->v2.in_cursor, &bv); - set_in_bvec(con, &bv); + if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) { + bv.bv_page = con->bounce_page; + bv.bv_offset = 0; + set_in_bvec(con, &bv); + } else { + set_in_bvec(con, &bv); + } WARN_ON(con->v2.in_state != IN_S_PREPARE_READ_DATA_CONT); return; } @@ -1791,14 +1823,13 @@ static void prepare_read_data_cont(struct ceph_connection *con) con->v2.in_state = IN_S_HANDLE_EPILOGUE; } -static void prepare_read_tail_plain(struct ceph_connection *con) +static int prepare_read_tail_plain(struct ceph_connection *con) { struct ceph_msg *msg = con->in_msg; if (!front_len(msg) && !middle_len(msg)) { WARN_ON(!data_len(msg)); - prepare_read_data(con); - return; + return prepare_read_data(con); } reset_in_kvecs(con); @@ -1817,6 +1848,7 @@ static void prepare_read_tail_plain(struct ceph_connection *con) add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN); con->v2.in_state = IN_S_HANDLE_EPILOGUE; } + return 0; } static void prepare_read_enc_page(struct ceph_connection *con) @@ -2699,8 +2731,7 @@ static int __handle_control(struct ceph_connection *con, void *p) if (con_secure(con)) return prepare_read_tail_secure(con); - prepare_read_tail_plain(con); - return 0; + return prepare_read_tail_plain(con); } static int handle_preamble(struct ceph_connection *con) @@ -2856,8 +2887,7 @@ static int populate_in_iter(struct ceph_connection *con) ret = handle_control_remainder(con); break; case IN_S_PREPARE_READ_DATA: - prepare_read_data(con); - ret = 0; + ret = prepare_read_data(con); break; case IN_S_PREPARE_READ_DATA_CONT: prepare_read_data_cont(con); -- cgit v1.2.3 From bfb1a7c91fb7758273b4a8d735313d9cc388b502 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Wed, 2 Feb 2022 12:55:53 -0800 Subject: x86/bug: Merge annotate_reachable() into _BUG_FLAGS() asm In __WARN_FLAGS(), we had two asm statements (abbreviated): asm volatile("ud2"); asm volatile(".pushsection .discard.reachable"); These pair of statements are used to trigger an exception, but then help objtool understand that for warnings, control flow will be restored immediately afterwards. The problem is that volatile is not a compiler barrier. GCC explicitly documents this: > Note that the compiler can move even volatile asm instructions > relative to other code, including across jump instructions. Also, no clobbers are specified to prevent instructions from subsequent statements from being scheduled by compiler before the second asm statement. This can lead to instructions from subsequent statements being emitted by the compiler before the second asm statement. Providing a scheduling model such as via -march= options enables the compiler to better schedule instructions with known latencies to hide latencies from data hazards compared to inline asm statements in which latencies are not estimated. If an instruction gets scheduled by the compiler between the two asm statements, then objtool will think that it is not reachable, producing a warning. To prevent instructions from being scheduled in between the two asm statements, merge them. Also remove an unnecessary unreachable() asm annotation from BUG() in favor of __builtin_unreachable(). objtool is able to track that the ud2 from BUG() terminates control flow within the function. Link: https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Volatile Link: https://github.com/ClangBuiltLinux/linux/issues/1483 Signed-off-by: Nick Desaulniers Signed-off-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20220202205557.2260694-1-ndesaulniers@google.com --- arch/x86/include/asm/bug.h | 20 +++++++++++--------- include/linux/compiler.h | 21 +++++---------------- 2 files changed, 16 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 84b87538a15d..bab883c0b6fe 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -22,7 +22,7 @@ #ifdef CONFIG_DEBUG_BUGVERBOSE -#define _BUG_FLAGS(ins, flags) \ +#define _BUG_FLAGS(ins, flags, extra) \ do { \ asm_inline volatile("1:\t" ins "\n" \ ".pushsection __bug_table,\"aw\"\n" \ @@ -31,7 +31,8 @@ do { \ "\t.word %c1" "\t# bug_entry::line\n" \ "\t.word %c2" "\t# bug_entry::flags\n" \ "\t.org 2b+%c3\n" \ - ".popsection" \ + ".popsection\n" \ + extra \ : : "i" (__FILE__), "i" (__LINE__), \ "i" (flags), \ "i" (sizeof(struct bug_entry))); \ @@ -39,14 +40,15 @@ do { \ #else /* !CONFIG_DEBUG_BUGVERBOSE */ -#define _BUG_FLAGS(ins, flags) \ +#define _BUG_FLAGS(ins, flags, extra) \ do { \ asm_inline volatile("1:\t" ins "\n" \ ".pushsection __bug_table,\"aw\"\n" \ "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \ "\t.word %c0" "\t# bug_entry::flags\n" \ "\t.org 2b+%c1\n" \ - ".popsection" \ + ".popsection\n" \ + extra \ : : "i" (flags), \ "i" (sizeof(struct bug_entry))); \ } while (0) @@ -55,7 +57,7 @@ do { \ #else -#define _BUG_FLAGS(ins, flags) asm volatile(ins) +#define _BUG_FLAGS(ins, flags, extra) asm volatile(ins) #endif /* CONFIG_GENERIC_BUG */ @@ -63,8 +65,8 @@ do { \ #define BUG() \ do { \ instrumentation_begin(); \ - _BUG_FLAGS(ASM_UD2, 0); \ - unreachable(); \ + _BUG_FLAGS(ASM_UD2, 0, ""); \ + __builtin_unreachable(); \ } while (0) /* @@ -75,9 +77,9 @@ do { \ */ #define __WARN_FLAGS(flags) \ do { \ + __auto_type f = BUGFLAG_WARNING|(flags); \ instrumentation_begin(); \ - _BUG_FLAGS(ASM_UD2, BUGFLAG_WARNING|(flags)); \ - annotate_reachable(); \ + _BUG_FLAGS(ASM_UD2, f, ASM_REACHABLE); \ instrumentation_end(); \ } while (0) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 429dcebe2b99..0f7fd205ab7e 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -117,14 +117,6 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, */ #define __stringify_label(n) #n -#define __annotate_reachable(c) ({ \ - asm volatile(__stringify_label(c) ":\n\t" \ - ".pushsection .discard.reachable\n\t" \ - ".long " __stringify_label(c) "b - .\n\t" \ - ".popsection\n\t" : : "i" (c)); \ -}) -#define annotate_reachable() __annotate_reachable(__COUNTER__) - #define __annotate_unreachable(c) ({ \ asm volatile(__stringify_label(c) ":\n\t" \ ".pushsection .discard.unreachable\n\t" \ @@ -133,24 +125,21 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, }) #define annotate_unreachable() __annotate_unreachable(__COUNTER__) -#define ASM_UNREACHABLE \ - "999:\n\t" \ - ".pushsection .discard.unreachable\n\t" \ - ".long 999b - .\n\t" \ +#define ASM_REACHABLE \ + "998:\n\t" \ + ".pushsection .discard.reachable\n\t" \ + ".long 998b - .\n\t" \ ".popsection\n\t" /* Annotate a C jump table to allow objtool to follow the code flow */ #define __annotate_jump_table __section(".rodata..c_jump_table") #else -#define annotate_reachable() #define annotate_unreachable() +# define ASM_REACHABLE #define __annotate_jump_table #endif -#ifndef ASM_UNREACHABLE -# define ASM_UNREACHABLE -#endif #ifndef unreachable # define unreachable() do { \ annotate_unreachable(); \ -- cgit v1.2.3 From 4a81f6da9cb2d1ef911131a6fd8bd15cb61fc772 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 1 Feb 2022 20:39:42 +0100 Subject: net, neigh: Do not trigger immediate probes on NUD_FAILED from neigh_managed_work syzkaller was able to trigger a deadlock for NTF_MANAGED entries [0]: kworker/0:16/14617 is trying to acquire lock: ffffffff8d4dd370 (&tbl->lock){++-.}-{2:2}, at: ___neigh_create+0x9e1/0x2990 net/core/neighbour.c:652 [...] but task is already holding lock: ffffffff8d4dd370 (&tbl->lock){++-.}-{2:2}, at: neigh_managed_work+0x35/0x250 net/core/neighbour.c:1572 The neighbor entry turned to NUD_FAILED state, where __neigh_event_send() triggered an immediate probe as per commit cd28ca0a3dd1 ("neigh: reduce arp latency") via neigh_probe() given table lock was held. One option to fix this situation is to defer the neigh_probe() back to the neigh_timer_handler() similarly as pre cd28ca0a3dd1. For the case of NTF_MANAGED, this deferral is acceptable given this only happens on actual failure state and regular / expected state is NUD_VALID with the entry already present. The fix adds a parameter to __neigh_event_send() in order to communicate whether immediate probe is allowed or disallowed. Existing call-sites of neigh_event_send() default as-is to immediate probe. However, the neigh_managed_work() disables it via use of neigh_event_send_probe(). [0] __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_deadlock_bug kernel/locking/lockdep.c:2956 [inline] check_deadlock kernel/locking/lockdep.c:2999 [inline] validate_chain kernel/locking/lockdep.c:3788 [inline] __lock_acquire.cold+0x149/0x3ab kernel/locking/lockdep.c:5027 lock_acquire kernel/locking/lockdep.c:5639 [inline] lock_acquire+0x1ab/0x510 kernel/locking/lockdep.c:5604 __raw_write_lock_bh include/linux/rwlock_api_smp.h:202 [inline] _raw_write_lock_bh+0x2f/0x40 kernel/locking/spinlock.c:334 ___neigh_create+0x9e1/0x2990 net/core/neighbour.c:652 ip6_finish_output2+0x1070/0x14f0 net/ipv6/ip6_output.c:123 __ip6_finish_output net/ipv6/ip6_output.c:191 [inline] __ip6_finish_output+0x61e/0xe90 net/ipv6/ip6_output.c:170 ip6_finish_output+0x32/0x200 net/ipv6/ip6_output.c:201 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip6_output+0x1e4/0x530 net/ipv6/ip6_output.c:224 dst_output include/net/dst.h:451 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] ndisc_send_skb+0xa99/0x17f0 net/ipv6/ndisc.c:508 ndisc_send_ns+0x3a9/0x840 net/ipv6/ndisc.c:650 ndisc_solicit+0x2cd/0x4f0 net/ipv6/ndisc.c:742 neigh_probe+0xc2/0x110 net/core/neighbour.c:1040 __neigh_event_send+0x37d/0x1570 net/core/neighbour.c:1201 neigh_event_send include/net/neighbour.h:470 [inline] neigh_managed_work+0x162/0x250 net/core/neighbour.c:1574 process_one_work+0x9ac/0x1650 kernel/workqueue.c:2307 worker_thread+0x657/0x1110 kernel/workqueue.c:2454 kthread+0x2e9/0x3a0 kernel/kthread.c:377 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 Fixes: 7482e3841d52 ("net, neigh: Add NTF_MANAGED flag for managed neighbor entries") Reported-by: syzbot+5239d0e1778a500d477a@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Cc: Eric Dumazet Cc: Roopa Prabhu Tested-by: syzbot+5239d0e1778a500d477a@syzkaller.appspotmail.com Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220201193942.5055-1-daniel@iogearbox.net Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 18 +++++++++++++----- net/core/neighbour.c | 18 ++++++++++++------ 2 files changed, 25 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 937389e04c8e..87419f7f5421 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -350,7 +350,8 @@ static inline struct neighbour *neigh_create(struct neigh_table *tbl, return __neigh_create(tbl, pkey, dev, true); } void neigh_destroy(struct neighbour *neigh); -int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb); +int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, + const bool immediate_ok); int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid); void __neigh_set_probe_once(struct neighbour *neigh); @@ -460,17 +461,24 @@ static inline struct neighbour * neigh_clone(struct neighbour *neigh) #define neigh_hold(n) refcount_inc(&(n)->refcnt) -static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +static __always_inline int neigh_event_send_probe(struct neighbour *neigh, + struct sk_buff *skb, + const bool immediate_ok) { unsigned long now = jiffies; - + if (READ_ONCE(neigh->used) != now) WRITE_ONCE(neigh->used, now); - if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE))) - return __neigh_event_send(neigh, skb); + if (!(neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))) + return __neigh_event_send(neigh, skb, immediate_ok); return 0; } +static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +{ + return neigh_event_send_probe(neigh, skb, true); +} + #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) { diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 6c2016f7f3d1..ec0bf737b076 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1133,7 +1133,8 @@ out: neigh_release(neigh); } -int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, + const bool immediate_ok) { int rc; bool immediate_probe = false; @@ -1154,12 +1155,17 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) atomic_set(&neigh->probes, NEIGH_VAR(neigh->parms, UCAST_PROBES)); neigh_del_timer(neigh); - neigh->nud_state = NUD_INCOMPLETE; + neigh->nud_state = NUD_INCOMPLETE; neigh->updated = now; - next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), - HZ/100); + if (!immediate_ok) { + next = now + 1; + } else { + immediate_probe = true; + next = now + max(NEIGH_VAR(neigh->parms, + RETRANS_TIME), + HZ / 100); + } neigh_add_timer(neigh, next); - immediate_probe = true; } else { neigh->nud_state = NUD_FAILED; neigh->updated = jiffies; @@ -1571,7 +1577,7 @@ static void neigh_managed_work(struct work_struct *work) write_lock_bh(&tbl->lock); list_for_each_entry(neigh, &tbl->managed_list, managed_list) - neigh_event_send(neigh, NULL); + neigh_event_send_probe(neigh, NULL, false); queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, NEIGH_VAR(&tbl->parms, DELAY_PROBE_TIME)); write_unlock_bh(&tbl->lock); -- cgit v1.2.3 From 4564661af6ee321942ec1ab012191d7adedd3e00 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 30 Jan 2022 11:17:05 -0800 Subject: xen: xenbus_dev.h: delete incorrect file name It is better/preferred not to include file names in source files because (a) they are not needed and (b) they can be incorrect, so just delete this incorrect file name. Signed-off-by: Randy Dunlap Reviewed-by: Juergen Gross Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Cc: xen-devel@lists.xenproject.org Link: https://lore.kernel.org/r/20220130191705.24971-1-rdunlap@infradead.org Signed-off-by: Juergen Gross --- include/xen/xenbus_dev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/xen/xenbus_dev.h b/include/xen/xenbus_dev.h index bbee8c6a349d..4dc45a51c044 100644 --- a/include/xen/xenbus_dev.h +++ b/include/xen/xenbus_dev.h @@ -1,6 +1,4 @@ /****************************************************************************** - * evtchn.h - * * Interface to /dev/xen/xenbus_backend. * * Copyright (c) 2011 Bastian Blank -- cgit v1.2.3 From 164666fa66669d437bdcc8d5f1744a2aee73be41 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Mon, 31 Jan 2022 12:23:07 -0500 Subject: Improve docs for IOCTL_GNTDEV_MAP_GRANT_REF --------------cKY3Ggs6VDUCSn4I6iN78sHA Content-Type: multipart/mixed; boundary="------------g0T69ASidFiPhh4eOY4XzIg1" --------------g0T69ASidFiPhh4eOY4XzIg1 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: quoted-printable The current implementation of gntdev guarantees that the first call to IOCTL_GNTDEV_MAP_GRANT_REF will set @index to 0. This is required to use gntdev for Wayland, which is a future desire of Qubes OS. Additionally, requesting zero grants results in an error, but this was not documented either. Document both of these. Signed-off-by: Demi Marie Obenour Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/f66c5a4e-2034-00b5-a635-6983bd999c07@gmail.com Signed-off-by: Juergen Gross --- include/uapi/xen/gntdev.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/xen/gntdev.h b/include/uapi/xen/gntdev.h index 9ac5515b9bc2..7a7145395c09 100644 --- a/include/uapi/xen/gntdev.h +++ b/include/uapi/xen/gntdev.h @@ -47,7 +47,13 @@ struct ioctl_gntdev_grant_ref { /* * Inserts the grant references into the mapping table of an instance * of gntdev. N.B. This does not perform the mapping, which is deferred - * until mmap() is called with @index as the offset. + * until mmap() is called with @index as the offset. @index should be + * considered opaque to userspace, with one exception: if no grant + * references have ever been inserted into the mapping table of this + * instance, @index will be set to 0. This is necessary to use gntdev + * with userspace APIs that expect a file descriptor that can be + * mmap()'d at offset 0, such as Wayland. If @count is set to 0, this + * ioctl will fail. */ #define IOCTL_GNTDEV_MAP_GRANT_REF \ _IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref)) -- cgit v1.2.3 From e85c81ba8859a4c839bcd69c5d83b32954133a5b Mon Sep 17 00:00:00 2001 From: Xin Yin Date: Mon, 17 Jan 2022 17:36:54 +0800 Subject: ext4: fast commit may not fallback for ineligible commit For the follow scenario: 1. jbd start commit transaction n 2. task A get new handle for transaction n+1 3. task A do some ineligible actions and mark FC_INELIGIBLE 4. jbd complete transaction n and clean FC_INELIGIBLE 5. task A call fsync In this case fast commit will not fallback to full commit and transaction n+1 also not handled by jbd. Make ext4_fc_mark_ineligible() also record transaction tid for latest ineligible case, when call ext4_fc_cleanup() check current transaction tid, if small than latest ineligible tid do not clear the EXT4_MF_FC_INELIGIBLE. Reported-by: kernel test robot Reported-by: Dan Carpenter Reported-by: Ritesh Harjani Suggested-by: Harshad Shirwadkar Signed-off-by: Xin Yin Link: https://lore.kernel.org/r/20220117093655.35160-2-yinxin.x@bytedance.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ext4.h | 3 ++- fs/ext4/extents.c | 4 ++-- fs/ext4/fast_commit.c | 33 +++++++++++++++++++++++++-------- fs/ext4/inode.c | 4 ++-- fs/ext4/ioctl.c | 4 ++-- fs/ext4/namei.c | 4 ++-- fs/ext4/super.c | 1 + fs/ext4/xattr.c | 6 +++--- fs/jbd2/commit.c | 2 +- fs/jbd2/journal.c | 2 +- include/linux/jbd2.h | 2 +- 11 files changed, 42 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 598ecf07652a..d52295becda3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1749,6 +1749,7 @@ struct ext4_sb_info { spinlock_t s_fc_lock; struct buffer_head *s_fc_bh; struct ext4_fc_stats s_fc_stats; + tid_t s_fc_ineligible_tid; #ifdef CONFIG_EXT4_DEBUG int s_fc_debug_max_replay; #endif @@ -2925,7 +2926,7 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode, struct dentry *dentry); void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); void ext4_fc_track_inode(handle_t *handle, struct inode *inode); -void ext4_fc_mark_ineligible(struct super_block *sb, int reason); +void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle); void ext4_fc_start_update(struct inode *inode); void ext4_fc_stop_update(struct inode *inode); void ext4_fc_del(struct inode *inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 5dd13108d4b7..3ce4fc250b0d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5336,7 +5336,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ret = PTR_ERR(handle); goto out_mmap; } - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode, 0); @@ -5476,7 +5476,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) ret = PTR_ERR(handle); goto out_mmap; } - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 1abe78b8d84f..e031afee42de 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -300,18 +300,32 @@ restart: } /* - * Mark file system as fast commit ineligible. This means that next commit - * operation would result in a full jbd2 commit. + * Mark file system as fast commit ineligible, and record latest + * ineligible transaction tid. This means until the recorded + * transaction, commit operation would result in a full jbd2 commit. */ -void ext4_fc_mark_ineligible(struct super_block *sb, int reason) +void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle) { struct ext4_sb_info *sbi = EXT4_SB(sb); + tid_t tid; if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) return; ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + if (handle && !IS_ERR(handle)) + tid = handle->h_transaction->t_tid; + else { + read_lock(&sbi->s_journal->j_state_lock); + tid = sbi->s_journal->j_running_transaction ? + sbi->s_journal->j_running_transaction->t_tid : 0; + read_unlock(&sbi->s_journal->j_state_lock); + } + spin_lock(&sbi->s_fc_lock); + if (sbi->s_fc_ineligible_tid < tid) + sbi->s_fc_ineligible_tid = tid; + spin_unlock(&sbi->s_fc_lock); WARN_ON(reason >= EXT4_FC_REASON_MAX); sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; } @@ -387,7 +401,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) mutex_unlock(&ei->i_fc_lock); node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } @@ -400,7 +414,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) if (!node->fcd_name.name) { kmem_cache_free(ext4_fc_dentry_cachep, node); ext4_fc_mark_ineligible(inode->i_sb, - EXT4_FC_REASON_NOMEM); + EXT4_FC_REASON_NOMEM, NULL); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } @@ -502,7 +516,7 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode) if (ext4_should_journal_data(inode)) { ext4_fc_mark_ineligible(inode->i_sb, - EXT4_FC_REASON_INODE_JOURNAL_DATA); + EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); return; } @@ -1179,7 +1193,7 @@ fallback: * Fast commit cleanup routine. This is called after every fast commit and * full commit. full is true if we are called after a full commit. */ -static void ext4_fc_cleanup(journal_t *journal, int full) +static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1227,7 +1241,10 @@ static void ext4_fc_cleanup(journal_t *journal, int full) &sbi->s_fc_q[FC_Q_MAIN]); ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); - ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + if (tid >= sbi->s_fc_ineligible_tid) { + sbi->s_fc_ineligible_tid = 0; + ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + } if (full) sbi->s_fc_bytes = 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9dbeb772de60..f368dd5fd8d5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -337,7 +337,7 @@ stop_handle: return; no_delete: if (!list_empty(&EXT4_I(inode)->i_fc_list)) - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ } @@ -5976,7 +5976,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return PTR_ERR(handle); ext4_fc_mark_ineligible(inode->i_sb, - EXT4_FC_REASON_JOURNAL_FLAG_CHANGE); + EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle); err = ext4_mark_inode_dirty(handle, inode); ext4_handle_sync(handle); ext4_journal_stop(handle); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index bbbedf27b71c..a8022c2c6a58 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -411,7 +411,7 @@ static long swap_inode_boot_loader(struct super_block *sb, err = -EINVAL; goto err_out; } - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT, handle); /* Protect extent tree against block allocations via delalloc */ ext4_double_down_write_data_sem(inode, inode_bl); @@ -1373,7 +1373,7 @@ mext_out: err = ext4_resize_fs(sb, n_blocks_count); if (EXT4_SB(sb)->s_journal) { - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE, NULL); jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 52c9bd154122..47b9f87dbc6f 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3889,7 +3889,7 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, * dirents in directories. */ ext4_fc_mark_ineligible(old.inode->i_sb, - EXT4_FC_REASON_RENAME_DIR); + EXT4_FC_REASON_RENAME_DIR, handle); } else { if (new.inode) ext4_fc_track_unlink(handle, new.dentry); @@ -4049,7 +4049,7 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlikely(retval)) goto end_rename; ext4_fc_mark_ineligible(new.inode->i_sb, - EXT4_FC_REASON_CROSS_RENAME); + EXT4_FC_REASON_CROSS_RENAME, handle); if (old.dir_bh) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); if (retval) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9a936ecbaa3b..6930b7737ce4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5084,6 +5084,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sbi->s_fc_bytes = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); + sbi->s_fc_ineligible_tid = 0; spin_lock_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); sbi->s_fc_replay_state.fc_regions = NULL; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 1e0fc1ed845b..042325349098 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -2408,7 +2408,7 @@ retry_inode: if (IS_SYNC(inode)) ext4_handle_sync(handle); } - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); cleanup: brelse(is.iloc.bh); @@ -2486,7 +2486,7 @@ retry: if (error == 0) error = error2; } - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, NULL); return error; } @@ -2920,7 +2920,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, error); goto cleanup; } - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); } error = 0; cleanup: diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 3cc4ab2ba7f4..d188fa913a07 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -1170,7 +1170,7 @@ restart_loop: if (journal->j_commit_callback) journal->j_commit_callback(journal, commit_transaction); if (journal->j_fc_cleanup_callback) - journal->j_fc_cleanup_callback(journal, 1); + journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid); trace_jbd2_end_commit(journal, commit_transaction); jbd_debug(1, "JBD2: commit %d complete, head %d\n", diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0b86a4365b66..a8e64ad11ae3 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -771,7 +771,7 @@ static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback) { jbd2_journal_unlock_updates(journal); if (journal->j_fc_cleanup_callback) - journal->j_fc_cleanup_callback(journal, 0); + journal->j_fc_cleanup_callback(journal, 0, tid); write_lock(&journal->j_state_lock); journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING; if (fallback) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index fd933c45281a..d63b8106796e 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1295,7 +1295,7 @@ struct journal_s * Clean-up after fast commit or full commit. JBD2 calls this function * after every commit operation. */ - void (*j_fc_cleanup_callback)(struct journal_s *journal, int); + void (*j_fc_cleanup_callback)(struct journal_s *journal, int full, tid_t tid); /** * @j_fc_replay_callback: -- cgit v1.2.3 From 3ca40c0d329113a9f76f6aa01abe73d9f16ace9d Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Mon, 17 Jan 2022 17:41:50 +0530 Subject: jbd2: cleanup unused functions declarations from jbd2.h During code review found no references of few of these below function declarations. This patch cleans those up from jbd2.h Signed-off-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/30d1fc327becda197a4136cf9cdc73d9baa3b7b9.1642416995.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index d63b8106796e..afc5572e7b8a 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1419,9 +1419,7 @@ extern void jbd2_journal_unfile_buffer(journal_t *, struct journal_head *); extern bool __jbd2_journal_refile_buffer(struct journal_head *); extern void jbd2_journal_refile_buffer(journal_t *, struct journal_head *); extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); -extern void __journal_free_buffer(struct journal_head *bh); extern void jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); -extern void __journal_clean_data_list(transaction_t *transaction); static inline void jbd2_file_log_bh(struct list_head *head, struct buffer_head *bh) { list_add_tail(&bh->b_assoc_buffers, head); @@ -1486,9 +1484,6 @@ extern int jbd2_journal_write_metadata_buffer(transaction_t *transaction, struct buffer_head **bh_out, sector_t blocknr); -/* Transaction locking */ -extern void __wait_on_journal (journal_t *); - /* Transaction cache support */ extern void jbd2_journal_destroy_transaction_cache(void); extern int __init jbd2_journal_init_transaction_cache(void); @@ -1774,8 +1769,6 @@ static inline unsigned long jbd2_log_space_left(journal_t *journal) #define BJ_Reserved 4 /* Buffer is reserved for access by journal */ #define BJ_Types 5 -extern int jbd_blocks_per_page(struct inode *inode); - /* JBD uses a CRC32 checksum */ #define JBD_MAX_CHECKSUM_SIZE 4 -- cgit v1.2.3 From 4f98186848707f530669238d90e0562d92a78aab Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Mon, 17 Jan 2022 17:41:51 +0530 Subject: jbd2: refactor wait logic for transaction updates into a common function No functionality change as such in this patch. This only refactors the common piece of code which waits for t_updates to finish into a common function named as jbd2_journal_wait_updates(journal_t *) Signed-off-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/8c564f70f4b2591171677a2a74fccb22a7b6c3a4.1642416995.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/jbd2/commit.c | 19 +++--------------- fs/jbd2/transaction.c | 53 +++++++++++++++++++++++++++++++-------------------- include/linux/jbd2.h | 4 +++- 3 files changed, 38 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index d188fa913a07..5b9408e3b370 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -484,22 +484,9 @@ void jbd2_journal_commit_transaction(journal_t *journal) stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start, stats.run.rs_locked); - spin_lock(&commit_transaction->t_handle_lock); - while (atomic_read(&commit_transaction->t_updates)) { - DEFINE_WAIT(wait); + // waits for any t_updates to finish + jbd2_journal_wait_updates(journal); - prepare_to_wait(&journal->j_wait_updates, &wait, - TASK_UNINTERRUPTIBLE); - if (atomic_read(&commit_transaction->t_updates)) { - spin_unlock(&commit_transaction->t_handle_lock); - write_unlock(&journal->j_state_lock); - schedule(); - write_lock(&journal->j_state_lock); - spin_lock(&commit_transaction->t_handle_lock); - } - finish_wait(&journal->j_wait_updates, &wait); - } - spin_unlock(&commit_transaction->t_handle_lock); commit_transaction->t_state = T_SWITCH; write_unlock(&journal->j_state_lock); @@ -817,7 +804,7 @@ start_journal_io: commit_transaction->t_state = T_COMMIT_DFLUSH; write_unlock(&journal->j_state_lock); - /* + /* * If the journal is not located on the file system device, * then we must flush the file system device before we issue * the commit record diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 6a3caedd2285..8e2f8275a253 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -449,7 +449,7 @@ repeat: } /* OK, account for the buffers that this operation expects to - * use and add the handle to the running transaction. + * use and add the handle to the running transaction. */ update_t_max_wait(transaction, ts); handle->h_transaction = transaction; @@ -836,6 +836,35 @@ int jbd2_journal_restart(handle_t *handle, int nblocks) } EXPORT_SYMBOL(jbd2_journal_restart); +/* + * Waits for any outstanding t_updates to finish. + * This is called with write j_state_lock held. + */ +void jbd2_journal_wait_updates(journal_t *journal) +{ + transaction_t *commit_transaction = journal->j_running_transaction; + + if (!commit_transaction) + return; + + spin_lock(&commit_transaction->t_handle_lock); + while (atomic_read(&commit_transaction->t_updates)) { + DEFINE_WAIT(wait); + + prepare_to_wait(&journal->j_wait_updates, &wait, + TASK_UNINTERRUPTIBLE); + if (atomic_read(&commit_transaction->t_updates)) { + spin_unlock(&commit_transaction->t_handle_lock); + write_unlock(&journal->j_state_lock); + schedule(); + write_lock(&journal->j_state_lock); + spin_lock(&commit_transaction->t_handle_lock); + } + finish_wait(&journal->j_wait_updates, &wait); + } + spin_unlock(&commit_transaction->t_handle_lock); +} + /** * jbd2_journal_lock_updates () - establish a transaction barrier. * @journal: Journal to establish a barrier on. @@ -863,27 +892,9 @@ void jbd2_journal_lock_updates(journal_t *journal) write_lock(&journal->j_state_lock); } - /* Wait until there are no running updates */ - while (1) { - transaction_t *transaction = journal->j_running_transaction; - - if (!transaction) - break; + /* Wait until there are no running t_updates */ + jbd2_journal_wait_updates(journal); - spin_lock(&transaction->t_handle_lock); - prepare_to_wait(&journal->j_wait_updates, &wait, - TASK_UNINTERRUPTIBLE); - if (!atomic_read(&transaction->t_updates)) { - spin_unlock(&transaction->t_handle_lock); - finish_wait(&journal->j_wait_updates, &wait); - break; - } - spin_unlock(&transaction->t_handle_lock); - write_unlock(&journal->j_state_lock); - schedule(); - finish_wait(&journal->j_wait_updates, &wait); - write_lock(&journal->j_state_lock); - } write_unlock(&journal->j_state_lock); /* diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index afc5572e7b8a..9c3ada74ffb1 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -594,7 +594,7 @@ struct transaction_s */ unsigned long t_log_start; - /* + /* * Number of buffers on the t_buffers list [j_list_lock, no locks * needed for jbd2 thread] */ @@ -1538,6 +1538,8 @@ extern int jbd2_journal_flush(journal_t *journal, unsigned int flags); extern void jbd2_journal_lock_updates (journal_t *); extern void jbd2_journal_unlock_updates (journal_t *); +void jbd2_journal_wait_updates(journal_t *); + extern journal_t * jbd2_journal_init_dev(struct block_device *bdev, struct block_device *fs_dev, unsigned long long start, int len, int bsize); -- cgit v1.2.3 From 67d6212afda218d564890d1674bab28e8612170f Mon Sep 17 00:00:00 2001 From: Igor Pylypiv Date: Thu, 27 Jan 2022 15:39:53 -0800 Subject: Revert "module, async: async_synchronize_full() on module init iff async is used" This reverts commit 774a1221e862b343388347bac9b318767336b20b. We need to finish all async code before the module init sequence is done. In the reverted commit the PF_USED_ASYNC flag was added to mark a thread that called async_schedule(). Then the PF_USED_ASYNC flag was used to determine whether or not async_synchronize_full() needs to be invoked. This works when modprobe thread is calling async_schedule(), but it does not work if module dispatches init code to a worker thread which then calls async_schedule(). For example, PCI driver probing is invoked from a worker thread based on a node where device is attached: if (cpu < nr_cpu_ids) error = work_on_cpu(cpu, local_pci_probe, &ddi); else error = local_pci_probe(&ddi); We end up in a situation where a worker thread gets the PF_USED_ASYNC flag set instead of the modprobe thread. As a result, async_synchronize_full() is not invoked and modprobe completes without waiting for the async code to finish. The issue was discovered while loading the pm80xx driver: (scsi_mod.scan=async) modprobe pm80xx worker ... do_init_module() ... pci_call_probe() work_on_cpu(local_pci_probe) local_pci_probe() pm8001_pci_probe() scsi_scan_host() async_schedule() worker->flags |= PF_USED_ASYNC; ... < return from worker > ... if (current->flags & PF_USED_ASYNC) <--- false async_synchronize_full(); Commit 21c3c5d28007 ("block: don't request module during elevator init") fixed the deadlock issue which the reverted commit 774a1221e862 ("module, async: async_synchronize_full() on module init iff async is used") tried to fix. Since commit 0fdff3ec6d87 ("async, kmod: warn on synchronous request_module() from async workers") synchronous module loading from async is not allowed. Given that the original deadlock issue is fixed and it is no longer allowed to call synchronous request_module() from async we can remove PF_USED_ASYNC flag to make module init consistently invoke async_synchronize_full() unless async module probe is requested. Signed-off-by: Igor Pylypiv Reviewed-by: Changyuan Lyu Reviewed-by: Luis Chamberlain Acked-by: Tejun Heo Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 - kernel/async.c | 3 --- kernel/module.c | 25 +++++-------------------- 3 files changed, 5 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index f5b2be39a78c..75ba8aa60248 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1680,7 +1680,6 @@ extern struct pid *cad_pid; #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ -#define PF_USED_ASYNC 0x00004000 /* Used async_schedule*(), used by module init */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF_FROZEN 0x00010000 /* Frozen for system suspend */ #define PF_KSWAPD 0x00020000 /* I am kswapd */ diff --git a/kernel/async.c b/kernel/async.c index b8d7a663497f..b2c4ba5686ee 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -205,9 +205,6 @@ async_cookie_t async_schedule_node_domain(async_func_t func, void *data, atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); - /* mark that this task has queued an async job, used by module init */ - current->flags |= PF_USED_ASYNC; - /* schedule for execution */ queue_work_node(node, system_unbound_wq, &entry->work); diff --git a/kernel/module.c b/kernel/module.c index 24dab046e16c..46a5c2ed1928 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3725,12 +3725,6 @@ static noinline int do_init_module(struct module *mod) } freeinit->module_init = mod->init_layout.base; - /* - * We want to find out whether @mod uses async during init. Clear - * PF_USED_ASYNC. async_schedule*() will set it. - */ - current->flags &= ~PF_USED_ASYNC; - do_mod_ctors(mod); /* Start the module */ if (mod->init != NULL) @@ -3756,22 +3750,13 @@ static noinline int do_init_module(struct module *mod) /* * We need to finish all async code before the module init sequence - * is done. This has potential to deadlock. For example, a newly - * detected block device can trigger request_module() of the - * default iosched from async probing task. Once userland helper - * reaches here, async_synchronize_full() will wait on the async - * task waiting on request_module() and deadlock. - * - * This deadlock is avoided by perfomring async_synchronize_full() - * iff module init queued any async jobs. This isn't a full - * solution as it will deadlock the same if module loading from - * async jobs nests more than once; however, due to the various - * constraints, this hack seems to be the best option for now. - * Please refer to the following thread for details. + * is done. This has potential to deadlock if synchronous module + * loading is requested from async (which is not allowed!). * - * http://thread.gmane.org/gmane.linux.kernel/1420814 + * See commit 0fdff3ec6d87 ("async, kmod: warn on synchronous + * request_module() from async workers") for more details. */ - if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC)) + if (!mod->async_probe_requested) async_synchronize_full(); ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base + -- cgit v1.2.3 From 87563a043cef044fed5db7967a75741cc16ad2b1 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Thu, 3 Feb 2022 23:08:11 +0800 Subject: ax25: fix reference count leaks of ax25_dev The previous commit d01ffb9eee4a ("ax25: add refcount in ax25_dev to avoid UAF bugs") introduces refcount into ax25_dev, but there are reference leak paths in ax25_ctl_ioctl(), ax25_fwd_ioctl(), ax25_rt_add(), ax25_rt_del() and ax25_rt_opt(). This patch uses ax25_dev_put() and adjusts the position of ax25_addr_ax25dev() to fix reference cout leaks of ax25_dev. Fixes: d01ffb9eee4a ("ax25: add refcount in ax25_dev to avoid UAF bugs") Signed-off-by: Duoming Zhou Reviewed-by: Dan Carpenter Link: https://lore.kernel.org/r/20220203150811.42256-1-duoming@zju.edu.cn Signed-off-by: Jakub Kicinski --- include/net/ax25.h | 8 +++++--- net/ax25/af_ax25.c | 12 ++++++++---- net/ax25/ax25_dev.c | 24 +++++++++++++++++------- net/ax25/ax25_route.c | 16 +++++++++++----- 4 files changed, 41 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/net/ax25.h b/include/net/ax25.h index 50b417df6221..8221af1811df 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -294,10 +294,12 @@ static __inline__ void ax25_cb_put(ax25_cb *ax25) } } -#define ax25_dev_hold(__ax25_dev) \ - refcount_inc(&((__ax25_dev)->refcount)) +static inline void ax25_dev_hold(ax25_dev *ax25_dev) +{ + refcount_inc(&ax25_dev->refcount); +} -static __inline__ void ax25_dev_put(ax25_dev *ax25_dev) +static inline void ax25_dev_put(ax25_dev *ax25_dev) { if (refcount_dec_and_test(&ax25_dev->refcount)) { kfree(ax25_dev); diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 32f61978ff29..3e49d28824ed 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -359,21 +359,25 @@ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg) if (copy_from_user(&ax25_ctl, arg, sizeof(ax25_ctl))) return -EFAULT; - if ((ax25_dev = ax25_addr_ax25dev(&ax25_ctl.port_addr)) == NULL) - return -ENODEV; - if (ax25_ctl.digi_count > AX25_MAX_DIGIS) return -EINVAL; if (ax25_ctl.arg > ULONG_MAX / HZ && ax25_ctl.cmd != AX25_KILL) return -EINVAL; + ax25_dev = ax25_addr_ax25dev(&ax25_ctl.port_addr); + if (!ax25_dev) + return -ENODEV; + digi.ndigi = ax25_ctl.digi_count; for (k = 0; k < digi.ndigi; k++) digi.calls[k] = ax25_ctl.digi_addr[k]; - if ((ax25 = ax25_find_cb(&ax25_ctl.source_addr, &ax25_ctl.dest_addr, &digi, ax25_dev->dev)) == NULL) + ax25 = ax25_find_cb(&ax25_ctl.source_addr, &ax25_ctl.dest_addr, &digi, ax25_dev->dev); + if (!ax25) { + ax25_dev_put(ax25_dev); return -ENOTCONN; + } switch (ax25_ctl.cmd) { case AX25_KILL: diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index 770b787fb7bb..d2a244e1c260 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -85,8 +85,8 @@ void ax25_dev_device_up(struct net_device *dev) spin_lock_bh(&ax25_dev_lock); ax25_dev->next = ax25_dev_list; ax25_dev_list = ax25_dev; - ax25_dev_hold(ax25_dev); spin_unlock_bh(&ax25_dev_lock); + ax25_dev_hold(ax25_dev); ax25_register_dev_sysctl(ax25_dev); } @@ -115,8 +115,8 @@ void ax25_dev_device_down(struct net_device *dev) if ((s = ax25_dev_list) == ax25_dev) { ax25_dev_list = s->next; - ax25_dev_put(ax25_dev); spin_unlock_bh(&ax25_dev_lock); + ax25_dev_put(ax25_dev); dev->ax25_ptr = NULL; dev_put_track(dev, &ax25_dev->dev_tracker); ax25_dev_put(ax25_dev); @@ -126,8 +126,8 @@ void ax25_dev_device_down(struct net_device *dev) while (s != NULL && s->next != NULL) { if (s->next == ax25_dev) { s->next = ax25_dev->next; - ax25_dev_put(ax25_dev); spin_unlock_bh(&ax25_dev_lock); + ax25_dev_put(ax25_dev); dev->ax25_ptr = NULL; dev_put_track(dev, &ax25_dev->dev_tracker); ax25_dev_put(ax25_dev); @@ -150,25 +150,35 @@ int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) switch (cmd) { case SIOCAX25ADDFWD: - if ((fwd_dev = ax25_addr_ax25dev(&fwd->port_to)) == NULL) + fwd_dev = ax25_addr_ax25dev(&fwd->port_to); + if (!fwd_dev) { + ax25_dev_put(ax25_dev); return -EINVAL; - if (ax25_dev->forward != NULL) + } + if (ax25_dev->forward) { + ax25_dev_put(fwd_dev); + ax25_dev_put(ax25_dev); return -EINVAL; + } ax25_dev->forward = fwd_dev->dev; ax25_dev_put(fwd_dev); + ax25_dev_put(ax25_dev); break; case SIOCAX25DELFWD: - if (ax25_dev->forward == NULL) + if (!ax25_dev->forward) { + ax25_dev_put(ax25_dev); return -EINVAL; + } ax25_dev->forward = NULL; + ax25_dev_put(ax25_dev); break; default: + ax25_dev_put(ax25_dev); return -EINVAL; } - ax25_dev_put(ax25_dev); return 0; } diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index 1e32693833e5..9751207f7757 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -75,11 +75,13 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) ax25_dev *ax25_dev; int i; - if ((ax25_dev = ax25_addr_ax25dev(&route->port_addr)) == NULL) - return -EINVAL; if (route->digi_count > AX25_MAX_DIGIS) return -EINVAL; + ax25_dev = ax25_addr_ax25dev(&route->port_addr); + if (!ax25_dev) + return -EINVAL; + write_lock_bh(&ax25_route_lock); ax25_rt = ax25_route_list; @@ -91,6 +93,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) if (route->digi_count != 0) { if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return -ENOMEM; } ax25_rt->digipeat->lastrepeat = -1; @@ -101,6 +104,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) } } write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return 0; } ax25_rt = ax25_rt->next; @@ -108,6 +112,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) if ((ax25_rt = kmalloc(sizeof(ax25_route), GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return -ENOMEM; } @@ -116,11 +121,11 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) ax25_rt->dev = ax25_dev->dev; ax25_rt->digipeat = NULL; ax25_rt->ip_mode = ' '; - ax25_dev_put(ax25_dev); if (route->digi_count != 0) { if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); kfree(ax25_rt); + ax25_dev_put(ax25_dev); return -ENOMEM; } ax25_rt->digipeat->lastrepeat = -1; @@ -133,6 +138,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) ax25_rt->next = ax25_route_list; ax25_route_list = ax25_rt; write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return 0; } @@ -173,8 +179,8 @@ static int ax25_rt_del(struct ax25_routes_struct *route) } } } - ax25_dev_put(ax25_dev); write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return 0; } @@ -216,8 +222,8 @@ static int ax25_rt_opt(struct ax25_route_opt_struct *rt_option) } out: - ax25_dev_put(ax25_dev); write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return err; } -- cgit v1.2.3 From d1ca60efc53d665cf89ed847a14a510a81770b81 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 2 Feb 2022 12:00:56 +0100 Subject: netfilter: ctnetlink: disable helper autoassign When userspace, e.g. conntrackd, inserts an entry with a specified helper, its possible that the helper is lost immediately after its added: ctnetlink_create_conntrack -> nf_ct_helper_ext_add + assign helper -> ctnetlink_setup_nat -> ctnetlink_parse_nat_setup -> parse_nat_setup -> nfnetlink_parse_nat_setup -> nf_nat_setup_info -> nf_conntrack_alter_reply -> __nf_ct_try_assign_helper ... and __nf_ct_try_assign_helper will zero the helper again. Set IPS_HELPER bit to bypass auto-assign logic, its unwanted, just like when helper is assigned via ruleset. Dropped old 'not strictly necessary' comment, it referred to use of rcu_assign_pointer() before it got replaced by RCU_INIT_POINTER(). NB: Fixes tag intentionally incorrect, this extends the referenced commit, but this change won't build without IPS_HELPER introduced there. Fixes: 6714cf5465d280 ("netfilter: nf_conntrack: fix explicit helper attachment and NAT") Reported-by: Pham Thanh Tuyen Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_conntrack_common.h | 2 +- net/netfilter/nf_conntrack_netlink.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index 4b3395082d15..26071021e986 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -106,7 +106,7 @@ enum ip_conntrack_status { IPS_NAT_CLASH = IPS_UNTRACKED, #endif - /* Conntrack got a helper explicitly attached via CT target. */ + /* Conntrack got a helper explicitly attached (ruleset, ctnetlink). */ IPS_HELPER_BIT = 13, IPS_HELPER = (1 << IPS_HELPER_BIT), diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index ac438370f94a..7032402ffd33 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2311,7 +2311,8 @@ ctnetlink_create_conntrack(struct net *net, if (helper->from_nlattr) helper->from_nlattr(helpinfo, ct); - /* not in hash table yet so not strictly necessary */ + /* disable helper auto-assignment for this entry */ + ct->status |= IPS_HELPER; RCU_INIT_POINTER(help->helper, helper); } } else { -- cgit v1.2.3 From ac9f0c810684a1b161c18eb4b91ce84cbc13c91d Mon Sep 17 00:00:00 2001 From: Anton Lundin Date: Thu, 3 Feb 2022 10:41:35 +0100 Subject: ata: libata-core: Introduce ATA_HORKAGE_NO_LOG_DIR horkage 06f6c4c6c3e8 ("ata: libata: add missing ata_identify_page_supported() calls") introduced additional calls to ata_identify_page_supported(), thus also adding indirectly accesses to the device log directory log page through ata_log_supported(). Reading this log page causes SATADOM-ML 3ME devices to lock up. Introduce the horkage flag ATA_HORKAGE_NO_LOG_DIR to prevent accesses to the log directory in ata_log_supported() and add a blacklist entry with this flag for "SATADOM-ML 3ME" devices. Fixes: 636f6e2af4fb ("libata: add horkage for missing Identify Device log") Cc: stable@vger.kernel.org # v5.10+ Signed-off-by: Anton Lundin Signed-off-by: Damien Le Moal --- drivers/ata/libata-core.c | 10 ++++++++++ include/linux/libata.h | 1 + 2 files changed, 11 insertions(+) (limited to 'include') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 67f88027680a..e1b1dd215267 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2007,6 +2007,9 @@ static bool ata_log_supported(struct ata_device *dev, u8 log) { struct ata_port *ap = dev->link->ap; + if (dev->horkage & ATA_HORKAGE_NO_LOG_DIR) + return false; + if (ata_read_log_page(dev, ATA_LOG_DIRECTORY, 0, ap->sector_buf, 1)) return false; return get_unaligned_le16(&ap->sector_buf[log * 2]) ? true : false; @@ -4073,6 +4076,13 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { { "WDC WD3000JD-*", NULL, ATA_HORKAGE_WD_BROKEN_LPM }, { "WDC WD3200JD-*", NULL, ATA_HORKAGE_WD_BROKEN_LPM }, + /* + * This sata dom device goes on a walkabout when the ATA_LOG_DIRECTORY + * log page is accessed. Ensure we never ask for this log page with + * these devices. + */ + { "SATADOM-ML 3ME", NULL, ATA_HORKAGE_NO_LOG_DIR }, + /* End Marker */ { } }; diff --git a/include/linux/libata.h b/include/linux/libata.h index 605756f645be..7f99b4d78822 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -380,6 +380,7 @@ enum { ATA_HORKAGE_MAX_TRIM_128M = (1 << 26), /* Limit max trim size to 128M */ ATA_HORKAGE_NO_NCQ_ON_ATI = (1 << 27), /* Disable NCQ on ATI chipset */ ATA_HORKAGE_NO_ID_DEV_LOG = (1 << 28), /* Identify device log missing */ + ATA_HORKAGE_NO_LOG_DIR = (1 << 29), /* Do not read log directory */ /* DMA mask for user DMA control: User visible values; DO NOT renumber */ -- cgit v1.2.3 From 80110bbfbba6f0078d5a1cbc8df004506db8ffe5 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 3 Feb 2022 20:49:24 -0800 Subject: mm/page_table_check: check entries at pmd levels syzbot detected a case where the page table counters were not properly updated. syzkaller login: ------------[ cut here ]------------ kernel BUG at mm/page_table_check.c:162! invalid opcode: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 3099 Comm: pasha Not tainted 5.16.0+ #48 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIO4 RIP: 0010:__page_table_check_zero+0x159/0x1a0 Call Trace: free_pcp_prepare+0x3be/0xaa0 free_unref_page+0x1c/0x650 free_compound_page+0xec/0x130 free_transhuge_page+0x1be/0x260 __put_compound_page+0x90/0xd0 release_pages+0x54c/0x1060 __pagevec_release+0x7c/0x110 shmem_undo_range+0x85e/0x1250 ... The repro involved having a huge page that is split due to uprobe event temporarily replacing one of the pages in the huge page. Later the huge page was combined again, but the counters were off, as the PTE level was not properly updated. Make sure that when PMD is cleared and prior to freeing the level the PTEs are updated. Link: https://lkml.kernel.org/r/20220131203249.2832273-5-pasha.tatashin@soleen.com Fixes: df4e817b7108 ("mm: page table check") Signed-off-by: Pasha Tatashin Acked-by: David Rientjes Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Dave Hansen Cc: Greg Thelen Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jiri Slaby Cc: Mike Rapoport Cc: Muchun Song Cc: Paul Turner Cc: Wei Xu Cc: Will Deacon Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_table_check.h | 19 +++++++++++++++++++ mm/khugepaged.c | 3 +++ mm/page_table_check.c | 20 ++++++++++++++++++++ 3 files changed, 42 insertions(+) (limited to 'include') diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 38cace1da7b6..01e16c7696ec 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -26,6 +26,9 @@ void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd); void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud); +void __page_table_check_pte_clear_range(struct mm_struct *mm, + unsigned long addr, + pmd_t pmd); static inline void page_table_check_alloc(struct page *page, unsigned int order) { @@ -100,6 +103,16 @@ static inline void page_table_check_pud_set(struct mm_struct *mm, __page_table_check_pud_set(mm, addr, pudp, pud); } +static inline void page_table_check_pte_clear_range(struct mm_struct *mm, + unsigned long addr, + pmd_t pmd) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pte_clear_range(mm, addr, pmd); +} + #else static inline void page_table_check_alloc(struct page *page, unsigned int order) @@ -143,5 +156,11 @@ static inline void page_table_check_pud_set(struct mm_struct *mm, { } +static inline void page_table_check_pte_clear_range(struct mm_struct *mm, + unsigned long addr, + pmd_t pmd) +{ +} + #endif /* CONFIG_PAGE_TABLE_CHECK */ #endif /* __LINUX_PAGE_TABLE_CHECK_H */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 30e59e4af272..131492fd1148 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -1422,10 +1423,12 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v spinlock_t *ptl; pmd_t pmd; + mmap_assert_write_locked(mm); ptl = pmd_lock(vma->vm_mm, pmdp); pmd = pmdp_collapse_flush(vma, addr, pmdp); spin_unlock(ptl); mm_dec_nr_ptes(mm); + page_table_check_pte_clear_range(mm, addr, pmd); pte_free(mm, pmd_pgtable(pmd)); } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index c61d7ebe13b1..3763bd077861 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -247,3 +247,23 @@ void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, } } EXPORT_SYMBOL(__page_table_check_pud_set); + +void __page_table_check_pte_clear_range(struct mm_struct *mm, + unsigned long addr, + pmd_t pmd) +{ + if (&init_mm == mm) + return; + + if (!pmd_bad(pmd) && !pmd_leaf(pmd)) { + pte_t *ptep = pte_offset_map(&pmd, addr); + unsigned long i; + + pte_unmap(ptep); + for (i = 0; i < PTRS_PER_PTE; i++) { + __page_table_check_pte_clear(mm, addr, *ptep); + addr += PAGE_SIZE; + ptep++; + } + } +} -- cgit v1.2.3 From 314c459a6fe0957b5885fbc65c53d51444092880 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 3 Feb 2022 20:49:29 -0800 Subject: mm/pgtable: define pte_index so that preprocessor could recognize it Since commit 974b9b2c68f3 ("mm: consolidate pte_index() and pte_offset_*() definitions") pte_index is a static inline and there is no define for it that can be recognized by the preprocessor. As a result, vm_insert_pages() uses slower loop over vm_insert_page() instead of insert_pages() that amortizes the cost of spinlock operations when inserting multiple pages. Link: https://lkml.kernel.org/r/20220111145457.20748-1-rppt@kernel.org Fixes: 974b9b2c68f3 ("mm: consolidate pte_index() and pte_offset_*() definitions") Signed-off-by: Mike Rapoport Reported-by: Christian Dietrich Reviewed-by: Khalid Aziz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pgtable.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index bc8713a76e03..f4f4077b97aa 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -62,6 +62,7 @@ static inline unsigned long pte_index(unsigned long address) { return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); } +#define pte_index pte_index #ifndef pmd_index static inline unsigned long pmd_index(unsigned long address) -- cgit v1.2.3 From d2a02e3c8bb6b347818518edff5a4b40ff52d6d8 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 19 Jan 2022 14:35:06 +0100 Subject: lib/crypto: blake2s: avoid indirect calls to compression function for Clang CFI blake2s_compress_generic is weakly aliased by blake2s_compress. The current harness for function selection uses a function pointer, which is ordinarily inlined and resolved at compile time. But when Clang's CFI is enabled, CFI still triggers when making an indirect call via a weak symbol. This seems like a bug in Clang's CFI, as though it's bucketing weak symbols and strong symbols differently. It also only seems to trigger when "full LTO" mode is used, rather than "thin LTO". [ 0.000000][ T0] Kernel panic - not syncing: CFI failure (target: blake2s_compress_generic+0x0/0x1444) [ 0.000000][ T0] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.16.0-mainline-06981-g076c855b846e #1 [ 0.000000][ T0] Hardware name: MT6873 (DT) [ 0.000000][ T0] Call trace: [ 0.000000][ T0] dump_backtrace+0xfc/0x1dc [ 0.000000][ T0] dump_stack_lvl+0xa8/0x11c [ 0.000000][ T0] panic+0x194/0x464 [ 0.000000][ T0] __cfi_check_fail+0x54/0x58 [ 0.000000][ T0] __cfi_slowpath_diag+0x354/0x4b0 [ 0.000000][ T0] blake2s_update+0x14c/0x178 [ 0.000000][ T0] _extract_entropy+0xf4/0x29c [ 0.000000][ T0] crng_initialize_primary+0x24/0x94 [ 0.000000][ T0] rand_initialize+0x2c/0x6c [ 0.000000][ T0] start_kernel+0x2f8/0x65c [ 0.000000][ T0] __primary_switched+0xc4/0x7be4 [ 0.000000][ T0] Rebooting in 5 seconds.. Nonetheless, the function pointer method isn't so terrific anyway, so this patch replaces it with a simple boolean, which also gets inlined away. This successfully works around the Clang bug. In general, I'm not too keen on all of the indirection involved here; it clearly does more harm than good. Hopefully the whole thing can get cleaned up down the road when lib/crypto is overhauled more comprehensively. But for now, we go with a simple bandaid. Fixes: 6048fdcc5f26 ("lib/crypto: blake2s: include as built-in") Link: https://github.com/ClangBuiltLinux/linux/issues/1567 Reported-by: Miles Chen Tested-by: Miles Chen Tested-by: Nathan Chancellor Tested-by: John Stultz Acked-by: Nick Desaulniers Reviewed-by: Eric Biggers Signed-off-by: Jason A. Donenfeld --- arch/arm/crypto/blake2s-shash.c | 4 ++-- arch/x86/crypto/blake2s-shash.c | 4 ++-- crypto/blake2s_generic.c | 4 ++-- include/crypto/internal/blake2s.h | 40 ++++++++++++++++++++++++--------------- lib/crypto/blake2s.c | 4 ++-- 5 files changed, 33 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/arch/arm/crypto/blake2s-shash.c b/arch/arm/crypto/blake2s-shash.c index 17c1c3bfe2f5..763c73beea2d 100644 --- a/arch/arm/crypto/blake2s-shash.c +++ b/arch/arm/crypto/blake2s-shash.c @@ -13,12 +13,12 @@ static int crypto_blake2s_update_arm(struct shash_desc *desc, const u8 *in, unsigned int inlen) { - return crypto_blake2s_update(desc, in, inlen, blake2s_compress); + return crypto_blake2s_update(desc, in, inlen, false); } static int crypto_blake2s_final_arm(struct shash_desc *desc, u8 *out) { - return crypto_blake2s_final(desc, out, blake2s_compress); + return crypto_blake2s_final(desc, out, false); } #define BLAKE2S_ALG(name, driver_name, digest_size) \ diff --git a/arch/x86/crypto/blake2s-shash.c b/arch/x86/crypto/blake2s-shash.c index f9e2fecdb761..59ae28abe35c 100644 --- a/arch/x86/crypto/blake2s-shash.c +++ b/arch/x86/crypto/blake2s-shash.c @@ -18,12 +18,12 @@ static int crypto_blake2s_update_x86(struct shash_desc *desc, const u8 *in, unsigned int inlen) { - return crypto_blake2s_update(desc, in, inlen, blake2s_compress); + return crypto_blake2s_update(desc, in, inlen, false); } static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out) { - return crypto_blake2s_final(desc, out, blake2s_compress); + return crypto_blake2s_final(desc, out, false); } #define BLAKE2S_ALG(name, driver_name, digest_size) \ diff --git a/crypto/blake2s_generic.c b/crypto/blake2s_generic.c index 72fe480f9bd6..5f96a21f8788 100644 --- a/crypto/blake2s_generic.c +++ b/crypto/blake2s_generic.c @@ -15,12 +15,12 @@ static int crypto_blake2s_update_generic(struct shash_desc *desc, const u8 *in, unsigned int inlen) { - return crypto_blake2s_update(desc, in, inlen, blake2s_compress_generic); + return crypto_blake2s_update(desc, in, inlen, true); } static int crypto_blake2s_final_generic(struct shash_desc *desc, u8 *out) { - return crypto_blake2s_final(desc, out, blake2s_compress_generic); + return crypto_blake2s_final(desc, out, true); } #define BLAKE2S_ALG(name, driver_name, digest_size) \ diff --git a/include/crypto/internal/blake2s.h b/include/crypto/internal/blake2s.h index d39cfa0d333e..52363eee2b20 100644 --- a/include/crypto/internal/blake2s.h +++ b/include/crypto/internal/blake2s.h @@ -24,14 +24,11 @@ static inline void blake2s_set_lastblock(struct blake2s_state *state) state->f[0] = -1; } -typedef void (*blake2s_compress_t)(struct blake2s_state *state, - const u8 *block, size_t nblocks, u32 inc); - /* Helper functions for BLAKE2s shared by the library and shash APIs */ -static inline void __blake2s_update(struct blake2s_state *state, - const u8 *in, size_t inlen, - blake2s_compress_t compress) +static __always_inline void +__blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen, + bool force_generic) { const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen; @@ -39,7 +36,12 @@ static inline void __blake2s_update(struct blake2s_state *state, return; if (inlen > fill) { memcpy(state->buf + state->buflen, in, fill); - (*compress)(state, state->buf, 1, BLAKE2S_BLOCK_SIZE); + if (force_generic) + blake2s_compress_generic(state, state->buf, 1, + BLAKE2S_BLOCK_SIZE); + else + blake2s_compress(state, state->buf, 1, + BLAKE2S_BLOCK_SIZE); state->buflen = 0; in += fill; inlen -= fill; @@ -47,7 +49,12 @@ static inline void __blake2s_update(struct blake2s_state *state, if (inlen > BLAKE2S_BLOCK_SIZE) { const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE); /* Hash one less (full) block than strictly possible */ - (*compress)(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE); + if (force_generic) + blake2s_compress_generic(state, in, nblocks - 1, + BLAKE2S_BLOCK_SIZE); + else + blake2s_compress(state, in, nblocks - 1, + BLAKE2S_BLOCK_SIZE); in += BLAKE2S_BLOCK_SIZE * (nblocks - 1); inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1); } @@ -55,13 +62,16 @@ static inline void __blake2s_update(struct blake2s_state *state, state->buflen += inlen; } -static inline void __blake2s_final(struct blake2s_state *state, u8 *out, - blake2s_compress_t compress) +static __always_inline void +__blake2s_final(struct blake2s_state *state, u8 *out, bool force_generic) { blake2s_set_lastblock(state); memset(state->buf + state->buflen, 0, BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */ - (*compress)(state, state->buf, 1, state->buflen); + if (force_generic) + blake2s_compress_generic(state, state->buf, 1, state->buflen); + else + blake2s_compress(state, state->buf, 1, state->buflen); cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); memcpy(out, state->h, state->outlen); } @@ -99,20 +109,20 @@ static inline int crypto_blake2s_init(struct shash_desc *desc) static inline int crypto_blake2s_update(struct shash_desc *desc, const u8 *in, unsigned int inlen, - blake2s_compress_t compress) + bool force_generic) { struct blake2s_state *state = shash_desc_ctx(desc); - __blake2s_update(state, in, inlen, compress); + __blake2s_update(state, in, inlen, force_generic); return 0; } static inline int crypto_blake2s_final(struct shash_desc *desc, u8 *out, - blake2s_compress_t compress) + bool force_generic) { struct blake2s_state *state = shash_desc_ctx(desc); - __blake2s_final(state, out, compress); + __blake2s_final(state, out, force_generic); return 0; } diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c index 9364f79937b8..c71c09621c09 100644 --- a/lib/crypto/blake2s.c +++ b/lib/crypto/blake2s.c @@ -18,14 +18,14 @@ void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen) { - __blake2s_update(state, in, inlen, blake2s_compress); + __blake2s_update(state, in, inlen, false); } EXPORT_SYMBOL(blake2s_update); void blake2s_final(struct blake2s_state *state, u8 *out) { WARN_ON(IS_ENABLED(DEBUG) && !out); - __blake2s_final(state, out, blake2s_compress); + __blake2s_final(state, out, false); memzero_explicit(state, sizeof(*state)); } EXPORT_SYMBOL(blake2s_final); -- cgit v1.2.3 From fda17afc6166e975bec1197bd94cd2a3317bce3f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 7 Feb 2022 11:27:53 +0900 Subject: ata: libata-core: Fix ata_dev_config_cpr() The concurrent positioning ranges log page 47h is a general purpose log page and not a subpage of the indentify device log. Using ata_identify_page_supported() to test for concurrent positioning ranges support is thus wrong. ata_log_supported() must be used. Furthermore, unlike other advanced ATA features (e.g. NCQ priority), accesses to the concurrent positioning ranges log page are not gated by a feature bit from the device IDENTIFY data. Since many older drives react badly to the READ LOG EXT and/or READ LOG DMA EXT commands isued to read device log pages, avoid problems with older drives by limiting the concurrent positioning ranges support detection to drives implementing at least the ACS-4 ATA standard (major version 11). This additional condition effectively turns ata_dev_config_cpr() into a nop for older drives, avoiding problems in the field. Fixes: fe22e1c2f705 ("libata: support concurrent positioning ranges log") BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=215519 Cc: stable@vger.kernel.org Reviewed-by: Hannes Reinecke Tested-by: Abderraouf Adjal Signed-off-by: Damien Le Moal --- drivers/ata/libata-core.c | 14 ++++++-------- include/linux/ata.h | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index e1b1dd215267..ba9273f80069 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2448,23 +2448,21 @@ static void ata_dev_config_cpr(struct ata_device *dev) struct ata_cpr_log *cpr_log = NULL; u8 *desc, *buf = NULL; - if (!ata_identify_page_supported(dev, - ATA_LOG_CONCURRENT_POSITIONING_RANGES)) + if (ata_id_major_version(dev->id) < 11 || + !ata_log_supported(dev, ATA_LOG_CONCURRENT_POSITIONING_RANGES)) goto out; /* - * Read IDENTIFY DEVICE data log, page 0x47 - * (concurrent positioning ranges). We can have at most 255 32B range - * descriptors plus a 64B header. + * Read the concurrent positioning ranges log (0x47). We can have at + * most 255 32B range descriptors plus a 64B header. */ buf_len = (64 + 255 * 32 + 511) & ~511; buf = kzalloc(buf_len, GFP_KERNEL); if (!buf) goto out; - err_mask = ata_read_log_page(dev, ATA_LOG_IDENTIFY_DEVICE, - ATA_LOG_CONCURRENT_POSITIONING_RANGES, - buf, buf_len >> 9); + err_mask = ata_read_log_page(dev, ATA_LOG_CONCURRENT_POSITIONING_RANGES, + 0, buf, buf_len >> 9); if (err_mask) goto out; diff --git a/include/linux/ata.h b/include/linux/ata.h index 199e47e97d64..21292b5bbb55 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -324,12 +324,12 @@ enum { ATA_LOG_NCQ_NON_DATA = 0x12, ATA_LOG_NCQ_SEND_RECV = 0x13, ATA_LOG_IDENTIFY_DEVICE = 0x30, + ATA_LOG_CONCURRENT_POSITIONING_RANGES = 0x47, /* Identify device log pages: */ ATA_LOG_SECURITY = 0x06, ATA_LOG_SATA_SETTINGS = 0x08, ATA_LOG_ZONED_INFORMATION = 0x09, - ATA_LOG_CONCURRENT_POSITIONING_RANGES = 0x47, /* Identify device SATA settings log:*/ ATA_LOG_DEVSLP_OFFSET = 0x30, -- cgit v1.2.3 From cb1f65c1e1424a4b5e4a86da8aa3b8fd8459c8ec Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 4 Feb 2022 18:35:22 +0100 Subject: PM: s2idle: ACPI: Fix wakeup interrupts handling After commit e3728b50cd9b ("ACPI: PM: s2idle: Avoid possible race related to the EC GPE") wakeup interrupts occurring immediately after the one discarded by acpi_s2idle_wake() may be missed. Moreover, if the SCI triggers again immediately after the rearming in acpi_s2idle_wake(), that wakeup may be missed too. The problem is that pm_system_irq_wakeup() only calls pm_system_wakeup() when pm_wakeup_irq is 0, but that's not the case any more after the interrupt causing acpi_s2idle_wake() to run until pm_wakeup_irq is cleared by the pm_wakeup_clear() call in s2idle_loop(). However, there may be wakeup interrupts occurring in that time frame and if that happens, they will be missed. To address that issue first move the clearing of pm_wakeup_irq to the point at which it is known that the interrupt causing acpi_s2idle_wake() to tun will be discarded, before rearming the SCI for wakeup. Moreover, because that only reduces the size of the time window in which the issue may manifest itself, allow pm_system_irq_wakeup() to register two second wakeup interrupts in a row and, when discarding the first one, replace it with the second one. [Of course, this assumes that only one wakeup interrupt can be discarded in one go, but currently that is the case and I am not aware of any plans to change that.] Fixes: e3728b50cd9b ("ACPI: PM: s2idle: Avoid possible race related to the EC GPE") Cc: 5.4+ # 5.4+ Signed-off-by: Rafael J. Wysocki --- drivers/acpi/sleep.c | 1 + drivers/base/power/wakeup.c | 41 ++++++++++++++++++++++++++++++++++------- include/linux/suspend.h | 4 ++-- kernel/power/main.c | 5 ++++- kernel/power/process.c | 2 +- kernel/power/suspend.c | 2 -- 6 files changed, 42 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index fac7c9d4c9a1..d4fbea91ab6b 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -758,6 +758,7 @@ bool acpi_s2idle_wake(void) return true; } + pm_wakeup_clear(acpi_sci_irq); rearm_wake_irq(acpi_sci_irq); } diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index 99bda0da23a8..8666590201c9 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -34,7 +34,8 @@ suspend_state_t pm_suspend_target_state; bool events_check_enabled __read_mostly; /* First wakeup IRQ seen by the kernel in the last cycle. */ -unsigned int pm_wakeup_irq __read_mostly; +static unsigned int wakeup_irq[2] __read_mostly; +static DEFINE_RAW_SPINLOCK(wakeup_irq_lock); /* If greater than 0 and the system is suspending, terminate the suspend. */ static atomic_t pm_abort_suspend __read_mostly; @@ -942,19 +943,45 @@ void pm_system_cancel_wakeup(void) atomic_dec_if_positive(&pm_abort_suspend); } -void pm_wakeup_clear(bool reset) +void pm_wakeup_clear(unsigned int irq_number) { - pm_wakeup_irq = 0; - if (reset) + raw_spin_lock_irq(&wakeup_irq_lock); + + if (irq_number && wakeup_irq[0] == irq_number) + wakeup_irq[0] = wakeup_irq[1]; + else + wakeup_irq[0] = 0; + + wakeup_irq[1] = 0; + + raw_spin_unlock_irq(&wakeup_irq_lock); + + if (!irq_number) atomic_set(&pm_abort_suspend, 0); } void pm_system_irq_wakeup(unsigned int irq_number) { - if (pm_wakeup_irq == 0) { - pm_wakeup_irq = irq_number; + unsigned long flags; + + raw_spin_lock_irqsave(&wakeup_irq_lock, flags); + + if (wakeup_irq[0] == 0) + wakeup_irq[0] = irq_number; + else if (wakeup_irq[1] == 0) + wakeup_irq[1] = irq_number; + else + irq_number = 0; + + raw_spin_unlock_irqrestore(&wakeup_irq_lock, flags); + + if (irq_number) pm_system_wakeup(); - } +} + +unsigned int pm_wakeup_irq(void) +{ + return wakeup_irq[0]; } /** diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 3e8ecdebe601..300273ff40cc 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -497,14 +497,14 @@ extern void ksys_sync_helper(void); /* drivers/base/power/wakeup.c */ extern bool events_check_enabled; -extern unsigned int pm_wakeup_irq; extern suspend_state_t pm_suspend_target_state; extern bool pm_wakeup_pending(void); extern void pm_system_wakeup(void); extern void pm_system_cancel_wakeup(void); -extern void pm_wakeup_clear(bool reset); +extern void pm_wakeup_clear(unsigned int irq_number); extern void pm_system_irq_wakeup(unsigned int irq_number); +extern unsigned int pm_wakeup_irq(void); extern bool pm_get_wakeup_count(unsigned int *count, bool block); extern bool pm_save_wakeup_count(unsigned int count); extern void pm_wakep_autosleep_enabled(bool set); diff --git a/kernel/power/main.c b/kernel/power/main.c index 44169f3081fd..7e646079fbeb 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -504,7 +504,10 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA; + if (!pm_wakeup_irq()) + return -ENODATA; + + return sprintf(buf, "%u\n", pm_wakeup_irq()); } power_attr_ro(pm_wakeup_irq); diff --git a/kernel/power/process.c b/kernel/power/process.c index b7e7798637b8..11b570fcf049 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -134,7 +134,7 @@ int freeze_processes(void) if (!pm_freezing) atomic_inc(&system_freezing_cnt); - pm_wakeup_clear(true); + pm_wakeup_clear(0); pr_info("Freezing user space processes ... "); pm_freezing = true; error = try_to_freeze_tasks(true); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 80cc1f0f502b..6fcdee7e87a5 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -136,8 +136,6 @@ static void s2idle_loop(void) break; } - pm_wakeup_clear(false); - s2idle_enter(); } -- cgit v1.2.3 From cfc56f85e72f5b9c5c5be26dc2b16518d36a7868 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Mon, 7 Feb 2022 18:13:18 +0100 Subject: net: do not keep the dst cache when uncloning an skb dst and its metadata When uncloning an skb dst and its associated metadata a new dst+metadata is allocated and the tunnel information from the old metadata is copied over there. The issue is the tunnel metadata has references to cached dst, which are copied along the way. When a dst+metadata refcount drops to 0 the metadata is freed including the cached dst entries. As they are also referenced in the initial dst+metadata, this ends up in UaFs. In practice the above did not happen because of another issue, the dst+metadata was never freed because its refcount never dropped to 0 (this will be fixed in a subsequent patch). Fix this by initializing the dst cache after copying the tunnel information from the old metadata to also unshare the dst cache. Fixes: d71785ffc7e7 ("net: add dst_cache to ovs vxlan lwtunnel") Cc: Paolo Abeni Reported-by: Vlad Buslov Tested-by: Vlad Buslov Signed-off-by: Antoine Tenart Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 14efa0ded75d..b997e0c1e362 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -123,6 +123,19 @@ static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb) memcpy(&new_md->u.tun_info, &md_dst->u.tun_info, sizeof(struct ip_tunnel_info) + md_size); +#ifdef CONFIG_DST_CACHE + /* Unclone the dst cache if there is one */ + if (new_md->u.tun_info.dst_cache.cache) { + int ret; + + ret = dst_cache_init(&new_md->u.tun_info.dst_cache, GFP_ATOMIC); + if (ret) { + metadata_dst_free(new_md); + return ERR_PTR(ret); + } + } +#endif + skb_dst_drop(skb); dst_hold(&new_md->dst); skb_dst_set(skb, &new_md->dst); -- cgit v1.2.3 From 9eeabdf17fa0ab75381045c867c370f4cc75a613 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Mon, 7 Feb 2022 18:13:19 +0100 Subject: net: fix a memleak when uncloning an skb dst and its metadata When uncloning an skb dst and its associated metadata, a new dst+metadata is allocated and later replaces the old one in the skb. This is helpful to have a non-shared dst+metadata attached to a specific skb. The issue is the uncloned dst+metadata is initialized with a refcount of 1, which is increased to 2 before attaching it to the skb. When tun_dst_unclone returns, the dst+metadata is only referenced from a single place (the skb) while its refcount is 2. Its refcount will never drop to 0 (when the skb is consumed), leading to a memory leak. Fix this by removing the call to dst_hold in tun_dst_unclone, as the dst+metadata refcount is already 1. Fixes: fc4099f17240 ("openvswitch: Fix egress tunnel info.") Cc: Pravin B Shelar Reported-by: Vlad Buslov Tested-by: Vlad Buslov Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index b997e0c1e362..adab27ba1ecb 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -137,7 +137,6 @@ static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb) #endif skb_dst_drop(skb); - dst_hold(&new_md->dst); skb_dst_set(skb, &new_md->dst); return new_md; } -- cgit v1.2.3 From c306d737691ef84305d4ed0d302c63db2932f0bb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 25 Jan 2022 15:57:45 -0500 Subject: NFSD: Deprecate NFS_OFFSET_MAX NFS_OFFSET_MAX was introduced way back in Linux v2.3.y before there was a kernel-wide OFFSET_MAX value. As a clean up, replace the last few uses of it with its generic equivalent, and get rid of it. Signed-off-by: Chuck Lever --- fs/nfsd/nfs3xdr.c | 2 +- fs/nfsd/nfs4xdr.c | 2 +- include/linux/nfs.h | 8 -------- 3 files changed, 2 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 2e47a07029f1..0293b8d65f10 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -1060,7 +1060,7 @@ svcxdr_encode_entry3_common(struct nfsd3_readdirres *resp, const char *name, return false; /* cookie */ resp->cookie_offset = dirlist->len; - if (xdr_stream_encode_u64(xdr, NFS_OFFSET_MAX) < 0) + if (xdr_stream_encode_u64(xdr, OFFSET_MAX) < 0) return false; return true; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index f5e3430bb6ff..714a3a3bd50c 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3495,7 +3495,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, p = xdr_reserve_space(xdr, 3*4 + namlen); if (!p) goto fail; - p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */ + p = xdr_encode_hyper(p, OFFSET_MAX); /* offset of next entry */ p = xdr_encode_array(p, name, namlen); /* name length & name */ nfserr = nfsd4_encode_dirent_fattr(xdr, cd, name, namlen); diff --git a/include/linux/nfs.h b/include/linux/nfs.h index 0dc7ad38a0da..b06375e88e58 100644 --- a/include/linux/nfs.h +++ b/include/linux/nfs.h @@ -36,14 +36,6 @@ static inline void nfs_copy_fh(struct nfs_fh *target, const struct nfs_fh *sourc memcpy(target->data, source->data, source->size); } - -/* - * This is really a general kernel constant, but since nothing like - * this is defined in the kernel headers, I have to do it here. - */ -#define NFS_OFFSET_MAX ((__s64)((~(__u64)0) >> 1)) - - enum nfs3_stable_how { NFS_UNSTABLE = 0, NFS_DATA_SYNC = 1, -- cgit v1.2.3 From 0764db9b49c932b89ee4d9e3236dff4bb07b4a66 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 11 Feb 2022 16:32:32 -0800 Subject: mm: memcg: synchronize objcg lists with a dedicated spinlock Alexander reported a circular lock dependency revealed by the mmap1 ltp test: LOCKDEP_CIRCULAR (suite: ltp, case: mtest06 (mmap1)) WARNING: possible circular locking dependency detected 5.17.0-20220113.rc0.git0.f2211f194038.300.fc35.s390x+debug #1 Not tainted ------------------------------------------------------ mmap1/202299 is trying to acquire lock: 00000001892c0188 (css_set_lock){..-.}-{2:2}, at: obj_cgroup_release+0x4a/0xe0 but task is already holding lock: 00000000ca3b3818 (&sighand->siglock){-.-.}-{2:2}, at: force_sig_info_to_task+0x38/0x180 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&sighand->siglock){-.-.}-{2:2}: __lock_acquire+0x604/0xbd8 lock_acquire.part.0+0xe2/0x238 lock_acquire+0xb0/0x200 _raw_spin_lock_irqsave+0x6a/0xd8 __lock_task_sighand+0x90/0x190 cgroup_freeze_task+0x2e/0x90 cgroup_migrate_execute+0x11c/0x608 cgroup_update_dfl_csses+0x246/0x270 cgroup_subtree_control_write+0x238/0x518 kernfs_fop_write_iter+0x13e/0x1e0 new_sync_write+0x100/0x190 vfs_write+0x22c/0x2d8 ksys_write+0x6c/0xf8 __do_syscall+0x1da/0x208 system_call+0x82/0xb0 -> #0 (css_set_lock){..-.}-{2:2}: check_prev_add+0xe0/0xed8 validate_chain+0x736/0xb20 __lock_acquire+0x604/0xbd8 lock_acquire.part.0+0xe2/0x238 lock_acquire+0xb0/0x200 _raw_spin_lock_irqsave+0x6a/0xd8 obj_cgroup_release+0x4a/0xe0 percpu_ref_put_many.constprop.0+0x150/0x168 drain_obj_stock+0x94/0xe8 refill_obj_stock+0x94/0x278 obj_cgroup_charge+0x164/0x1d8 kmem_cache_alloc+0xac/0x528 __sigqueue_alloc+0x150/0x308 __send_signal+0x260/0x550 send_signal+0x7e/0x348 force_sig_info_to_task+0x104/0x180 force_sig_fault+0x48/0x58 __do_pgm_check+0x120/0x1f0 pgm_check_handler+0x11e/0x180 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&sighand->siglock); lock(css_set_lock); lock(&sighand->siglock); lock(css_set_lock); *** DEADLOCK *** 2 locks held by mmap1/202299: #0: 00000000ca3b3818 (&sighand->siglock){-.-.}-{2:2}, at: force_sig_info_to_task+0x38/0x180 #1: 00000001892ad560 (rcu_read_lock){....}-{1:2}, at: percpu_ref_put_many.constprop.0+0x0/0x168 stack backtrace: CPU: 15 PID: 202299 Comm: mmap1 Not tainted 5.17.0-20220113.rc0.git0.f2211f194038.300.fc35.s390x+debug #1 Hardware name: IBM 3906 M04 704 (LPAR) Call Trace: dump_stack_lvl+0x76/0x98 check_noncircular+0x136/0x158 check_prev_add+0xe0/0xed8 validate_chain+0x736/0xb20 __lock_acquire+0x604/0xbd8 lock_acquire.part.0+0xe2/0x238 lock_acquire+0xb0/0x200 _raw_spin_lock_irqsave+0x6a/0xd8 obj_cgroup_release+0x4a/0xe0 percpu_ref_put_many.constprop.0+0x150/0x168 drain_obj_stock+0x94/0xe8 refill_obj_stock+0x94/0x278 obj_cgroup_charge+0x164/0x1d8 kmem_cache_alloc+0xac/0x528 __sigqueue_alloc+0x150/0x308 __send_signal+0x260/0x550 send_signal+0x7e/0x348 force_sig_info_to_task+0x104/0x180 force_sig_fault+0x48/0x58 __do_pgm_check+0x120/0x1f0 pgm_check_handler+0x11e/0x180 INFO: lockdep is turned off. In this example a slab allocation from __send_signal() caused a refilling and draining of a percpu objcg stock, resulted in a releasing of another non-related objcg. Objcg release path requires taking the css_set_lock, which is used to synchronize objcg lists. This can create a circular dependency with the sighandler lock, which is taken with the locked css_set_lock by the freezer code (to freeze a task). In general it seems that using css_set_lock to synchronize objcg lists makes any slab allocations and deallocation with the locked css_set_lock and any intervened locks risky. To fix the problem and make the code more robust let's stop using css_set_lock to synchronize objcg lists and use a new dedicated spinlock instead. Link: https://lkml.kernel.org/r/Yfm1IHmoGdyUR81T@carbon.dhcp.thefacebook.com Fixes: bf4f059954dc ("mm: memcg/slab: obj_cgroup API") Signed-off-by: Roman Gushchin Reported-by: Alexander Egorenkov Tested-by: Alexander Egorenkov Reviewed-by: Waiman Long Acked-by: Tejun Heo Reviewed-by: Shakeel Butt Reviewed-by: Jeremy Linton Tested-by: Jeremy Linton Cc: Johannes Weiner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 +++-- mm/memcontrol.c | 10 +++++----- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b72d75141e12..0abbd685703b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -219,7 +219,7 @@ struct obj_cgroup { struct mem_cgroup *memcg; atomic_t nr_charged_bytes; union { - struct list_head list; + struct list_head list; /* protected by objcg_lock */ struct rcu_head rcu; }; }; @@ -315,7 +315,8 @@ struct mem_cgroup { #ifdef CONFIG_MEMCG_KMEM int kmemcg_id; struct obj_cgroup __rcu *objcg; - struct list_head objcg_list; /* list of inherited objcgs */ + /* list of inherited objcgs, protected by objcg_lock */ + struct list_head objcg_list; #endif MEMCG_PADDING(_pad2_); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 09d342c7cbd0..36e9f38c919d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -254,7 +254,7 @@ struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr) } #ifdef CONFIG_MEMCG_KMEM -extern spinlock_t css_set_lock; +static DEFINE_SPINLOCK(objcg_lock); bool mem_cgroup_kmem_disabled(void) { @@ -298,9 +298,9 @@ static void obj_cgroup_release(struct percpu_ref *ref) if (nr_pages) obj_cgroup_uncharge_pages(objcg, nr_pages); - spin_lock_irqsave(&css_set_lock, flags); + spin_lock_irqsave(&objcg_lock, flags); list_del(&objcg->list); - spin_unlock_irqrestore(&css_set_lock, flags); + spin_unlock_irqrestore(&objcg_lock, flags); percpu_ref_exit(ref); kfree_rcu(objcg, rcu); @@ -332,7 +332,7 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg, objcg = rcu_replace_pointer(memcg->objcg, NULL, true); - spin_lock_irq(&css_set_lock); + spin_lock_irq(&objcg_lock); /* 1) Ready to reparent active objcg. */ list_add(&objcg->list, &memcg->objcg_list); @@ -342,7 +342,7 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg, /* 3) Move already reparented objcgs to the parent's list */ list_splice(&memcg->objcg_list, &parent->objcg_list); - spin_unlock_irq(&css_set_lock); + spin_unlock_irq(&objcg_lock); percpu_ref_kill(&objcg->refcnt); } -- cgit v1.2.3 From 8913c61001482378d4ed8cc577b17c1ba3e847e4 Mon Sep 17 00:00:00 2001 From: Peng Liu Date: Fri, 11 Feb 2022 16:32:35 -0800 Subject: kfence: make test case compatible with run time set sample interval The parameter kfence_sample_interval can be set via boot parameter and late shell command, which is convenient for automated tests and KFENCE parameter optimization. However, KFENCE test case just uses compile-time CONFIG_KFENCE_SAMPLE_INTERVAL, which will make KFENCE test case not run as users desired. Export kfence_sample_interval, so that KFENCE test case can use run-time-set sample interval. Link: https://lkml.kernel.org/r/20220207034432.185532-1-liupeng256@huawei.com Signed-off-by: Peng Liu Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Sumit Semwal Cc: Christian Knig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kfence.h | 2 ++ mm/kfence/core.c | 3 ++- mm/kfence/kfence_test.c | 8 ++++---- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/kfence.h b/include/linux/kfence.h index 4b5e3679a72c..f49e64222628 100644 --- a/include/linux/kfence.h +++ b/include/linux/kfence.h @@ -17,6 +17,8 @@ #include #include +extern unsigned long kfence_sample_interval; + /* * We allocate an even number of pages, as it simplifies calculations to map * address to metadata indices; effectively, the very first page serves as an diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 5ad40e3add45..13128fa13062 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -47,7 +47,8 @@ static bool kfence_enabled __read_mostly; -static unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL; +unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL; +EXPORT_SYMBOL_GPL(kfence_sample_interval); /* Export for test modules. */ #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c index a22b1af85577..50dbb815a2a8 100644 --- a/mm/kfence/kfence_test.c +++ b/mm/kfence/kfence_test.c @@ -268,13 +268,13 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat * 100x the sample interval should be more than enough to ensure we get * a KFENCE allocation eventually. */ - timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL); + timeout = jiffies + msecs_to_jiffies(100 * kfence_sample_interval); /* * Especially for non-preemption kernels, ensure the allocation-gate * timer can catch up: after @resched_after, every failed allocation * attempt yields, to ensure the allocation-gate timer is scheduled. */ - resched_after = jiffies + msecs_to_jiffies(CONFIG_KFENCE_SAMPLE_INTERVAL); + resched_after = jiffies + msecs_to_jiffies(kfence_sample_interval); do { if (test_cache) alloc = kmem_cache_alloc(test_cache, gfp); @@ -608,7 +608,7 @@ static void test_gfpzero(struct kunit *test) int i; /* Skip if we think it'd take too long. */ - KFENCE_TEST_REQUIRES(test, CONFIG_KFENCE_SAMPLE_INTERVAL <= 100); + KFENCE_TEST_REQUIRES(test, kfence_sample_interval <= 100); setup_test_cache(test, size, 0, NULL); buf1 = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); @@ -739,7 +739,7 @@ static void test_memcache_alloc_bulk(struct kunit *test) * 100x the sample interval should be more than enough to ensure we get * a KFENCE allocation eventually. */ - timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL); + timeout = jiffies + msecs_to_jiffies(100 * kfence_sample_interval); do { void *objects[100]; int i, num = kmem_cache_alloc_bulk(test_cache, GFP_ATOMIC, ARRAY_SIZE(objects), -- cgit v1.2.3