summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/fid.h6
-rw-r--r--fs/9p/v9fs.c2
-rw-r--r--fs/9p/v9fs.h2
-rw-r--r--fs/9p/vfs_dir.c5
-rw-r--r--fs/9p/vfs_file.c5
-rw-r--r--fs/9p/vfs_inode.c6
-rw-r--r--fs/9p/vfs_inode_dotl.c1
-rw-r--r--fs/autofs/Kconfig12
-rw-r--r--fs/btrfs/block-group.c82
-rw-r--r--fs/btrfs/block-group.h11
-rw-r--r--fs/btrfs/block-rsv.c5
-rw-r--r--fs/btrfs/ctree.h1
-rw-r--r--fs/btrfs/delayed-inode.c5
-rw-r--r--fs/btrfs/delayed-inode.h1
-rw-r--r--fs/btrfs/disk-io.c20
-rw-r--r--fs/btrfs/extent-tree.c5
-rw-r--r--fs/btrfs/extent_io.c38
-rw-r--r--fs/btrfs/extent_map.c6
-rw-r--r--fs/btrfs/free-space-tree.c24
-rw-r--r--fs/btrfs/inode.c218
-rw-r--r--fs/btrfs/qgroup.c1
-rw-r--r--fs/btrfs/raid56.c11
-rw-r--r--fs/btrfs/relocation.c45
-rw-r--r--fs/btrfs/scrub.c3
-rw-r--r--fs/btrfs/transaction.c10
-rw-r--r--fs/btrfs/tree-checker.c14
-rw-r--r--fs/btrfs/volumes.c20
-rw-r--r--fs/btrfs/zoned.c3
-rw-r--r--fs/ceph/dir.c5
-rw-r--r--fs/ceph/mds_client.c4
-rw-r--r--fs/ceph/mds_client.h5
-rw-r--r--fs/ceph/metric.c2
-rw-r--r--fs/ceph/super.c10
-rw-r--r--fs/coda/dir.c20
-rw-r--r--fs/erofs/decompressor.c37
-rw-r--r--fs/erofs/inode.c3
-rw-r--r--fs/erofs/super.c2
-rw-r--r--fs/erofs/zdata.c11
-rw-r--r--fs/exfat/balloc.c6
-rw-r--r--fs/exfat/dir.c39
-rw-r--r--fs/exportfs/expfs.c2
-rw-r--r--fs/ext4/mballoc.c172
-rw-r--r--fs/ext4/xattr.c14
-rw-r--r--fs/file.c24
-rw-r--r--fs/fs_context.c1
-rw-r--r--fs/fsopen.c106
-rw-r--r--fs/fuse/dir.c4
-rw-r--r--fs/fuse/inode.c8
-rw-r--r--fs/fuse/ioctl.c21
-rw-r--r--fs/gfs2/file.c4
-rw-r--r--fs/gfs2/trans.c14
-rw-r--r--fs/inode.c3
-rw-r--r--fs/iomap/buffered-io.c4
-rw-r--r--fs/jbd2/checkpoint.c277
-rw-r--r--fs/jbd2/commit.c3
-rw-r--r--fs/jbd2/transaction.c40
-rw-r--r--fs/jfs/namei.c3
-rw-r--r--fs/nfs/direct.c26
-rw-r--r--fs/nfs/nfs42proc.c5
-rw-r--r--fs/nfs/nfs4proc.c14
-rw-r--r--fs/nfs/sysfs.c4
-rw-r--r--fs/nfsd/nfs4state.c4
-rw-r--r--fs/nfsd/nfsctl.c1
-rw-r--r--fs/nfsd/vfs.c9
-rw-r--r--fs/nilfs2/inode.c8
-rw-r--r--fs/nilfs2/segment.c7
-rw-r--r--fs/nilfs2/the_nilfs.h2
-rw-r--r--fs/nls/nls_base.c4
-rw-r--r--fs/ntfs/dir.c3
-rw-r--r--fs/ocfs2/file.c5
-rw-r--r--fs/open.c2
-rw-r--r--fs/overlayfs/readdir.c3
-rw-r--r--fs/overlayfs/super.c2
-rw-r--r--fs/proc/base.c2
-rw-r--r--fs/proc/kcore.c30
-rw-r--r--fs/proc/task_mmu.c8
-rw-r--r--fs/proc/vmcore.c2
-rw-r--r--fs/readdir.c68
-rw-r--r--fs/smb/client/cifs_debug.c10
-rw-r--r--fs/smb/client/cifsfs.h4
-rw-r--r--fs/smb/client/cifsglob.h3
-rw-r--r--fs/smb/client/cifssmb.c5
-rw-r--r--fs/smb/client/connect.c35
-rw-r--r--fs/smb/client/dfs.c32
-rw-r--r--fs/smb/client/file.c29
-rw-r--r--fs/smb/client/fs_context.c4
-rw-r--r--fs/smb/client/ioctl.c22
-rw-r--r--fs/smb/client/misc.c1
-rw-r--r--fs/smb/client/sess.c4
-rw-r--r--fs/smb/client/smb2ops.c8
-rw-r--r--fs/smb/client/smb2pdu.c3
-rw-r--r--fs/smb/client/smb2transport.c2
-rw-r--r--fs/smb/server/ksmbd_netlink.h3
-rw-r--r--fs/smb/server/server.c7
-rw-r--r--fs/smb/server/smb2misc.c10
-rw-r--r--fs/smb/server/smb2pdu.c56
-rw-r--r--fs/smb/server/smb_common.c19
-rw-r--r--fs/smb/server/smb_common.h2
-rw-r--r--fs/smb/server/vfs.c65
-rw-r--r--fs/smb/server/vfs.h4
-rw-r--r--fs/splice.c2
-rw-r--r--fs/super.c64
-rw-r--r--fs/vboxsf/dir.c3
-rw-r--r--fs/vboxsf/shfl_hostintf.h6
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h75
-rw-r--r--fs/xfs/libxfs/xfs_fs.h4
-rw-r--r--fs/xfs/xfs_ondisk.h5
-rw-r--r--fs/zonefs/file.c111
-rw-r--r--fs/zonefs/super.c9
-rw-r--r--fs/zonefs/zonefs.h2
110 files changed, 1316 insertions, 908 deletions
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index 0c51889a60b3..29281b7c3887 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -46,8 +46,8 @@ static inline struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
* NOTE: these are set after open so only reflect 9p client not
* underlying file system on server.
*/
-static inline void v9fs_fid_add_modes(struct p9_fid *fid, int s_flags,
- int s_cache, unsigned int f_flags)
+static inline void v9fs_fid_add_modes(struct p9_fid *fid, unsigned int s_flags,
+ unsigned int s_cache, unsigned int f_flags)
{
if (fid->qid.type != P9_QTFILE)
return;
@@ -57,7 +57,7 @@ static inline void v9fs_fid_add_modes(struct p9_fid *fid, int s_flags,
(s_flags & V9FS_DIRECT_IO) || (f_flags & O_DIRECT)) {
fid->mode |= P9L_DIRECT; /* no read or write cache */
} else if ((!(s_cache & CACHE_WRITEBACK)) ||
- (f_flags & O_DSYNC) | (s_flags & V9FS_SYNC)) {
+ (f_flags & O_DSYNC) || (s_flags & V9FS_SYNC)) {
fid->mode |= P9L_NOWRITECACHE;
}
}
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index c7f774fe398f..d525957594b6 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -545,8 +545,6 @@ void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses)
p9_client_begin_disconnect(v9ses->clnt);
}
-extern int v9fs_error_init(void);
-
static struct kobject *v9fs_kobj;
#ifdef CONFIG_9P_FSCACHE
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 06a2514f0d88..698c43dd5dc8 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -108,7 +108,7 @@ enum p9_cache_bits {
struct v9fs_session_info {
/* options */
- unsigned char flags;
+ unsigned int flags;
unsigned char nodev;
unsigned short debug;
unsigned int afid;
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 45b684b7d8d7..4102759a5cb5 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -208,7 +208,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
struct p9_fid *fid;
__le32 version;
loff_t i_size;
- int retval = 0;
+ int retval = 0, put_err;
fid = filp->private_data;
p9_debug(P9_DEBUG_VFS, "inode: %p filp: %p fid: %d\n",
@@ -221,7 +221,8 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
spin_lock(&inode->i_lock);
hlist_del(&fid->ilist);
spin_unlock(&inode->i_lock);
- retval = p9_fid_put(fid);
+ put_err = p9_fid_put(fid);
+ retval = retval < 0 ? retval : put_err;
}
if ((filp->f_mode & FMODE_WRITE)) {
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 2996fb00387f..11cd8d23f6f2 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -505,10 +505,7 @@ v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma)
p9_debug(P9_DEBUG_MMAP, "filp :%p\n", filp);
if (!(v9ses->cache & CACHE_WRITEBACK)) {
- p9_debug(P9_DEBUG_CACHE, "(no mmap mode)");
- if (vma->vm_flags & VM_MAYSHARE)
- return -ENODEV;
- invalidate_inode_pages2(filp->f_mapping);
+ p9_debug(P9_DEBUG_CACHE, "(read-only mmap mode)");
return generic_file_readonly_mmap(filp, vma);
}
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index d24d1f20e922..0d28ecf668d0 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -163,7 +163,6 @@ int v9fs_uflags2omode(int uflags, int extended)
{
int ret;
- ret = 0;
switch (uflags&3) {
default:
case O_RDONLY:
@@ -603,7 +602,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry);
- err = 0;
name = dentry->d_name.name;
dfid = v9fs_parent_fid(dentry);
if (IS_ERR(dfid)) {
@@ -815,8 +813,6 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
if (!(flags & O_CREAT) || d_really_is_positive(dentry))
return finish_no_open(file, res);
- err = 0;
-
v9ses = v9fs_inode2v9ses(dir);
perm = unixmode2p9mode(v9ses, mode);
p9_omode = v9fs_uflags2omode(flags, v9fs_proto_dotu(v9ses));
@@ -912,7 +908,6 @@ v9fs_vfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
return -EINVAL;
p9_debug(P9_DEBUG_VFS, "\n");
- retval = 0;
old_inode = d_inode(old_dentry);
new_inode = d_inode(new_dentry);
v9ses = v9fs_inode2v9ses(old_inode);
@@ -1066,7 +1061,6 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap,
if (retval)
return retval;
- retval = -EPERM;
v9ses = v9fs_dentry2v9ses(dentry);
if (iattr->ia_valid & ATTR_FILE) {
fid = iattr->ia_file->private_data;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 8e8d5d2a13d8..1312f68965ac 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -366,7 +366,6 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
struct posix_acl *dacl = NULL, *pacl = NULL;
p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry);
- err = 0;
v9ses = v9fs_inode2v9ses(dir);
omode |= S_IFDIR;
diff --git a/fs/autofs/Kconfig b/fs/autofs/Kconfig
index 3b3a6b1423c6..54c12d9484cb 100644
--- a/fs/autofs/Kconfig
+++ b/fs/autofs/Kconfig
@@ -1,18 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
-config AUTOFS4_FS
- tristate "Old Kconfig name for Kernel automounter support"
- select AUTOFS_FS
- help
- This name exists for people to just automatically pick up the
- new name of the autofs Kconfig option. All it does is select
- the new option name.
-
- It will go away in a release or two as people have
- transitioned to just plain AUTOFS_FS.
-
config AUTOFS_FS
tristate "Kernel automounter support (supports v3, v4 and v5)"
- default n
help
The automounter is a tool to automatically mount remote file systems
on demand. This implementation is partially kernel-based to reduce
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 48ae509f2ac2..82324c327a50 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -441,13 +441,23 @@ void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
u64 num_bytes)
{
struct btrfs_caching_control *caching_ctl;
+ int progress;
caching_ctl = btrfs_get_caching_control(cache);
if (!caching_ctl)
return;
+ /*
+ * We've already failed to allocate from this block group, so even if
+ * there's enough space in the block group it isn't contiguous enough to
+ * allow for an allocation, so wait for at least the next wakeup tick,
+ * or for the thing to be done.
+ */
+ progress = atomic_read(&caching_ctl->progress);
+
wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
- (cache->free_space_ctl->free_space >= num_bytes));
+ (progress != atomic_read(&caching_ctl->progress) &&
+ (cache->free_space_ctl->free_space >= num_bytes)));
btrfs_put_caching_control(caching_ctl);
}
@@ -499,12 +509,16 @@ static void fragment_free_space(struct btrfs_block_group *block_group)
* used yet since their free space will be released as soon as the transaction
* commits.
*/
-u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end)
+int add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end,
+ u64 *total_added_ret)
{
struct btrfs_fs_info *info = block_group->fs_info;
- u64 extent_start, extent_end, size, total_added = 0;
+ u64 extent_start, extent_end, size;
int ret;
+ if (total_added_ret)
+ *total_added_ret = 0;
+
while (start < end) {
ret = find_first_extent_bit(&info->excluded_extents, start,
&extent_start, &extent_end,
@@ -517,10 +531,12 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
start = extent_end + 1;
} else if (extent_start > start && extent_start < end) {
size = extent_start - start;
- total_added += size;
ret = btrfs_add_free_space_async_trimmed(block_group,
start, size);
- BUG_ON(ret); /* -ENOMEM or logic error */
+ if (ret)
+ return ret;
+ if (total_added_ret)
+ *total_added_ret += size;
start = extent_end + 1;
} else {
break;
@@ -529,13 +545,15 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
if (start < end) {
size = end - start;
- total_added += size;
ret = btrfs_add_free_space_async_trimmed(block_group, start,
size);
- BUG_ON(ret); /* -ENOMEM or logic error */
+ if (ret)
+ return ret;
+ if (total_added_ret)
+ *total_added_ret += size;
}
- return total_added;
+ return 0;
}
/*
@@ -779,8 +797,13 @@ next:
if (key.type == BTRFS_EXTENT_ITEM_KEY ||
key.type == BTRFS_METADATA_ITEM_KEY) {
- total_found += add_new_free_space(block_group, last,
- key.objectid);
+ u64 space_added;
+
+ ret = add_new_free_space(block_group, last, key.objectid,
+ &space_added);
+ if (ret)
+ goto out;
+ total_found += space_added;
if (key.type == BTRFS_METADATA_ITEM_KEY)
last = key.objectid +
fs_info->nodesize;
@@ -789,17 +812,18 @@ next:
if (total_found > CACHING_CTL_WAKE_UP) {
total_found = 0;
- if (wakeup)
+ if (wakeup) {
+ atomic_inc(&caching_ctl->progress);
wake_up(&caching_ctl->wait);
+ }
}
}
path->slots[0]++;
}
- ret = 0;
-
- total_found += add_new_free_space(block_group, last,
- block_group->start + block_group->length);
+ ret = add_new_free_space(block_group, last,
+ block_group->start + block_group->length,
+ NULL);
out:
btrfs_free_path(path);
return ret;
@@ -898,6 +922,7 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
init_waitqueue_head(&caching_ctl->wait);
caching_ctl->block_group = cache;
refcount_set(&caching_ctl->count, 2);
+ atomic_set(&caching_ctl->progress, 0);
btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
spin_lock(&cache->lock);
@@ -1640,13 +1665,14 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
- trace_btrfs_add_unused_block_group(bg);
spin_lock(&fs_info->unused_bgs_lock);
if (list_empty(&bg->bg_list)) {
btrfs_get_block_group(bg);
+ trace_btrfs_add_unused_block_group(bg);
list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
- } else {
+ } else if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) {
/* Pull out the block group from the reclaim_bgs list. */
+ trace_btrfs_add_unused_block_group(bg);
list_move_tail(&bg->bg_list, &fs_info->unused_bgs);
}
spin_unlock(&fs_info->unused_bgs_lock);
@@ -2087,6 +2113,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
/* Shouldn't have super stripes in sequential zones */
if (zoned && nr) {
+ kfree(logical);
btrfs_err(fs_info,
"zoned: block group %llu must not contain super block",
cache->start);
@@ -2292,9 +2319,11 @@ static int read_one_block_group(struct btrfs_fs_info *info,
btrfs_free_excluded_extents(cache);
} else if (cache->used == 0) {
cache->cached = BTRFS_CACHE_FINISHED;
- add_new_free_space(cache, cache->start,
- cache->start + cache->length);
+ ret = add_new_free_space(cache, cache->start,
+ cache->start + cache->length, NULL);
btrfs_free_excluded_extents(cache);
+ if (ret)
+ goto error;
}
ret = btrfs_add_block_group_cache(info, cache);
@@ -2668,6 +2697,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
next:
btrfs_delayed_refs_rsv_release(fs_info, 1);
list_del_init(&block_group->bg_list);
+ clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
}
btrfs_trans_release_chunk_metadata(trans);
}
@@ -2707,6 +2737,13 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
if (!cache)
return ERR_PTR(-ENOMEM);
+ /*
+ * Mark it as new before adding it to the rbtree of block groups or any
+ * list, so that no other task finds it and calls btrfs_mark_bg_unused()
+ * before the new flag is set.
+ */
+ set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags);
+
cache->length = size;
set_free_space_tree_thresholds(cache);
cache->flags = type;
@@ -2730,9 +2767,12 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
return ERR_PTR(ret);
}
- add_new_free_space(cache, chunk_offset, chunk_offset + size);
-
+ ret = add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL);
btrfs_free_excluded_extents(cache);
+ if (ret) {
+ btrfs_put_block_group(cache);
+ return ERR_PTR(ret);
+ }
/*
* Ensure the corresponding space_info object is created and
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index f204addc3fe8..74b61e663028 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -70,6 +70,11 @@ enum btrfs_block_group_flags {
BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
/* Indicate that the block group is placed on a sequential zone */
BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE,
+ /*
+ * Indicate that block group is in the list of new block groups of a
+ * transaction.
+ */
+ BLOCK_GROUP_FLAG_NEW,
};
enum btrfs_caching_type {
@@ -85,6 +90,8 @@ struct btrfs_caching_control {
wait_queue_head_t wait;
struct btrfs_work work;
struct btrfs_block_group *block_group;
+ /* Track progress of caching during allocation. */
+ atomic_t progress;
refcount_t count;
};
@@ -284,8 +291,8 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait);
void btrfs_put_caching_control(struct btrfs_caching_control *ctl);
struct btrfs_caching_control *btrfs_get_caching_control(
struct btrfs_block_group *cache);
-u64 add_new_free_space(struct btrfs_block_group *block_group,
- u64 start, u64 end);
+int add_new_free_space(struct btrfs_block_group *block_group,
+ u64 start, u64 end, u64 *total_added_ret);
struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
struct btrfs_fs_info *fs_info,
const u64 chunk_offset);
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 6279d200cf83..77684c5e0c8b 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -349,6 +349,11 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
}
read_unlock(&fs_info->global_root_lock);
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) {
+ num_bytes += btrfs_root_used(&fs_info->block_group_root->root_item);
+ min_items++;
+ }
+
/*
* But we also want to reserve enough space so we can do the fallback
* global reserve for an unlink, which is an additional
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f2d2b313bde5..9419f4e37a58 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -443,6 +443,7 @@ struct btrfs_drop_extents_args {
struct btrfs_file_private {
void *filldir_buf;
+ u64 last_index;
struct extent_state *llseek_cached_state;
};
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 6f8c0a9b2017..d37c066f2df7 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1632,6 +1632,7 @@ int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode)
}
bool btrfs_readdir_get_delayed_items(struct inode *inode,
+ u64 last_index,
struct list_head *ins_list,
struct list_head *del_list)
{
@@ -1651,14 +1652,14 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode,
mutex_lock(&delayed_node->mutex);
item = __btrfs_first_delayed_insertion_item(delayed_node);
- while (item) {
+ while (item && item->index <= last_index) {
refcount_inc(&item->refs);
list_add_tail(&item->readdir_list, ins_list);
item = __btrfs_next_delayed_item(item);
}
item = __btrfs_first_delayed_deletion_item(delayed_node);
- while (item) {
+ while (item && item->index <= last_index) {
refcount_inc(&item->refs);
list_add_tail(&item->readdir_list, del_list);
item = __btrfs_next_delayed_item(item);
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 4f21daa3dbc7..dc1085b2a397 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -148,6 +148,7 @@ void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info);
/* Used for readdir() */
bool btrfs_readdir_get_delayed_items(struct inode *inode,
+ u64 last_index,
struct list_head *ins_list,
struct list_head *del_list);
void btrfs_readdir_put_delayed_items(struct inode *inode,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7513388b0567..a9a2c5446c18 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1103,7 +1103,8 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
btrfs_drew_lock_init(&root->snapshot_lock);
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
- !btrfs_is_data_reloc_root(root)) {
+ !btrfs_is_data_reloc_root(root) &&
+ is_fstree(root->root_key.objectid)) {
set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
btrfs_check_and_init_root_item(&root->root_item);
}
@@ -1300,6 +1301,16 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
root = btrfs_get_global_root(fs_info, objectid);
if (root)
return root;
+
+ /*
+ * If we're called for non-subvolume trees, and above function didn't
+ * find one, do not try to read it from disk.
+ *
+ * This is namely for free-space-tree and quota tree, which can change
+ * at runtime and should only be grabbed from fs_info.
+ */
+ if (!is_fstree(objectid) && objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
+ return ERR_PTR(-ENOENT);
again:
root = btrfs_lookup_fs_root(fs_info, objectid);
if (root) {
@@ -3438,11 +3449,16 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
* For devices supporting discard turn on discard=async automatically,
* unless it's already set or disabled. This could be turned off by
* nodiscard for the same mount.
+ *
+ * The zoned mode piggy backs on the discard functionality for
+ * resetting a zone. There is no reason to delay the zone reset as it is
+ * fast enough. So, do not enable async discard for zoned mode.
*/
if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) ||
btrfs_test_opt(fs_info, DISCARD_ASYNC) ||
btrfs_test_opt(fs_info, NODISCARD)) &&
- fs_info->fs_devices->discardable) {
+ fs_info->fs_devices->discardable &&
+ !btrfs_is_zoned(fs_info)) {
btrfs_set_and_info(fs_info, DISCARD_ASYNC,
"auto enabling async discard");
}
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 911908ea5f6f..f396a9afa403 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4310,8 +4310,11 @@ have_block_group:
ret = 0;
}
- if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
+ if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) {
+ if (!cache_block_group_error)
+ cache_block_group_error = -EIO;
goto loop;
+ }
if (!find_free_extent_check_size_class(ffe_ctl, block_group))
goto loop;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a91d5ad27984..90ad3006ef3a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -902,7 +902,30 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl,
size -= len;
pg_offset += len;
disk_bytenr += len;
- bio_ctrl->len_to_oe_boundary -= len;
+
+ /*
+ * len_to_oe_boundary defaults to U32_MAX, which isn't page or
+ * sector aligned. alloc_new_bio() then sets it to the end of
+ * our ordered extent for writes into zoned devices.
+ *
+ * When len_to_oe_boundary is tracking an ordered extent, we
+ * trust the ordered extent code to align things properly, and
+ * the check above to cap our write to the ordered extent
+ * boundary is correct.
+ *
+ * When len_to_oe_boundary is U32_MAX, the cap above would
+ * result in a 4095 byte IO for the last page right before
+ * we hit the bio limit of UINT_MAX. bio_add_page() has all
+ * the checks required to make sure we don't overflow the bio,
+ * and we should just ignore len_to_oe_boundary completely
+ * unless we're using it to track an ordered extent.
+ *
+ * It's pretty hard to make a bio sized U32_MAX, but it can
+ * happen when the page cache is able to feed us contiguous
+ * pages for large extents.
+ */
+ if (bio_ctrl->len_to_oe_boundary != U32_MAX)
+ bio_ctrl->len_to_oe_boundary -= len;
/* Ordered extent boundary: move on to a new bio. */
if (bio_ctrl->len_to_oe_boundary == 0)
@@ -2145,6 +2168,12 @@ retry:
continue;
}
+ if (!folio_test_dirty(folio)) {
+ /* Someone wrote it for us. */
+ folio_unlock(folio);
+ continue;
+ }
+
if (wbc->sync_mode != WB_SYNC_NONE) {
if (folio_test_writeback(folio))
submit_write_bio(bio_ctrl, 0);
@@ -2164,11 +2193,12 @@ retry:
}
/*
- * the filesystem may choose to bump up nr_to_write.
+ * The filesystem may choose to bump up nr_to_write.
* We have to make sure to honor the new nr_to_write
- * at any time
+ * at any time.
*/
- nr_to_write_done = wbc->nr_to_write <= 0;
+ nr_to_write_done = (wbc->sync_mode == WB_SYNC_NONE &&
+ wbc->nr_to_write <= 0);
}
folio_batch_release(&fbatch);
cond_resched();
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 0cdb3e86f29b..a6d8368ed0ed 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -760,8 +760,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
start = em_end;
- if (end != (u64)-1)
- len = start + len - em_end;
goto next;
}
@@ -829,8 +827,8 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
if (!split)
goto remove_em;
}
- split->start = start + len;
- split->len = em_end - (start + len);
+ split->start = end;
+ split->len = em_end - end;
split->block_start = em->block_start;
split->flags = flags;
split->compress_type = em->compress_type;
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 045ddce32eca..f169378e2ca6 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1515,9 +1515,13 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
if (prev_bit == 0 && bit == 1) {
extent_start = offset;
} else if (prev_bit == 1 && bit == 0) {
- total_found += add_new_free_space(block_group,
- extent_start,
- offset);
+ u64 space_added;
+
+ ret = add_new_free_space(block_group, extent_start,
+ offset, &space_added);
+ if (ret)
+ goto out;
+ total_found += space_added;
if (total_found > CACHING_CTL_WAKE_UP) {
total_found = 0;
wake_up(&caching_ctl->wait);
@@ -1529,8 +1533,9 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
}
}
if (prev_bit == 1) {
- total_found += add_new_free_space(block_group, extent_start,
- end);
+ ret = add_new_free_space(block_group, extent_start, end, NULL);
+ if (ret)
+ goto out;
extent_count++;
}
@@ -1569,6 +1574,8 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
end = block_group->start + block_group->length;
while (1) {
+ u64 space_added;
+
ret = btrfs_next_item(root, path);
if (ret < 0)
goto out;
@@ -1583,8 +1590,11 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY);
ASSERT(key.objectid < end && key.objectid + key.offset <= end);
- total_found += add_new_free_space(block_group, key.objectid,
- key.objectid + key.offset);
+ ret = add_new_free_space(block_group, key.objectid,
+ key.objectid + key.offset, &space_added);
+ if (ret)
+ goto out;
+ total_found += space_added;
if (total_found > CACHING_CTL_WAKE_UP) {
total_found = 0;
wake_up(&caching_ctl->wait);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0964c66411a1..4e987fe6d648 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1654,8 +1654,6 @@ out_unlock:
clear_bits,
page_ops);
start += cur_alloc_size;
- if (start >= end)
- return ret;
}
/*
@@ -1664,9 +1662,11 @@ out_unlock:
* space_info's bytes_may_use counter, reserved in
* btrfs_check_data_free_space().
*/
- extent_clear_unlock_delalloc(inode, start, end, locked_page,
- clear_bits | EXTENT_CLEAR_DATA_RESV,
- page_ops);
+ if (start < end) {
+ clear_bits |= EXTENT_CLEAR_DATA_RESV;
+ extent_clear_unlock_delalloc(inode, start, end, locked_page,
+ clear_bits, page_ops);
+ }
return ret;
}
@@ -3482,15 +3482,21 @@ zeroit:
void btrfs_add_delayed_iput(struct btrfs_inode *inode)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ unsigned long flags;
if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
return;
atomic_inc(&fs_info->nr_delayed_iputs);
- spin_lock(&fs_info->delayed_iput_lock);
+ /*
+ * Need to be irq safe here because we can be called from either an irq
+ * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq
+ * context.
+ */
+ spin_lock_irqsave(&fs_info->delayed_iput_lock, flags);
ASSERT(list_empty(&inode->delayed_iput));
list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs);
- spin_unlock(&fs_info->delayed_iput_lock);
+ spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags);
if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
wake_up_process(fs_info->cleaner_kthread);
}
@@ -3499,37 +3505,46 @@ static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
struct btrfs_inode *inode)
{
list_del_init(&inode->delayed_iput);
- spin_unlock(&fs_info->delayed_iput_lock);
+ spin_unlock_irq(&fs_info->delayed_iput_lock);
iput(&inode->vfs_inode);
if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
wake_up(&fs_info->delayed_iputs_wait);
- spin_lock(&fs_info->delayed_iput_lock);
+ spin_lock_irq(&fs_info->delayed_iput_lock);
}
static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
struct btrfs_inode *inode)
{
if (!list_empty(&inode->delayed_iput)) {
- spin_lock(&fs_info->delayed_iput_lock);
+ spin_lock_irq(&fs_info->delayed_iput_lock);
if (!list_empty(&inode->delayed_iput))
run_delayed_iput_locked(fs_info, inode);
- spin_unlock(&fs_info->delayed_iput_lock);
+ spin_unlock_irq(&fs_info->delayed_iput_lock);
}
}
void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
{
-
- spin_lock(&fs_info->delayed_iput_lock);
+ /*
+ * btrfs_put_ordered_extent() can run in irq context (see bio.c), which
+ * calls btrfs_add_delayed_iput() and that needs to lock
+ * fs_info->delayed_iput_lock. So we need to disable irqs here to
+ * prevent a deadlock.
+ */
+ spin_lock_irq(&fs_info->delayed_iput_lock);
while (!list_empty(&fs_info->delayed_iputs)) {
struct btrfs_inode *inode;
inode = list_first_entry(&fs_info->delayed_iputs,
struct btrfs_inode, delayed_iput);
run_delayed_iput_locked(fs_info, inode);
- cond_resched_lock(&fs_info->delayed_iput_lock);
+ if (need_resched()) {
+ spin_unlock_irq(&fs_info->delayed_iput_lock);
+ cond_resched();
+ spin_lock_irq(&fs_info->delayed_iput_lock);
+ }
}
- spin_unlock(&fs_info->delayed_iput_lock);
+ spin_unlock_irq(&fs_info->delayed_iput_lock);
}
/*
@@ -3659,11 +3674,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
found_key.type = BTRFS_INODE_ITEM_KEY;
found_key.offset = 0;
inode = btrfs_iget(fs_info->sb, last_objectid, root);
- ret = PTR_ERR_OR_ZERO(inode);
- if (ret && ret != -ENOENT)
- goto out;
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ inode = NULL;
+ if (ret != -ENOENT)
+ goto out;
+ }
- if (ret == -ENOENT && root == fs_info->tree_root) {
+ if (!inode && root == fs_info->tree_root) {
struct btrfs_root *dead_root;
int is_dead_root = 0;
@@ -3724,17 +3742,17 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
* deleted but wasn't. The inode number may have been reused,
* but either way, we can delete the orphan item.
*/
- if (ret == -ENOENT || inode->i_nlink) {
- if (!ret) {
+ if (!inode || inode->i_nlink) {
+ if (inode) {
ret = btrfs_drop_verity_items(BTRFS_I(inode));
iput(inode);
+ inode = NULL;
if (ret)
goto out;
}
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- iput(inode);
goto out;
}
btrfs_debug(fs_info, "auto deleting %Lu",
@@ -3742,10 +3760,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
ret = btrfs_del_orphan_item(trans, root,
found_key.objectid);
btrfs_end_transaction(trans);
- if (ret) {
- iput(inode);
+ if (ret)
goto out;
- }
continue;
}
@@ -4845,9 +4861,6 @@ again:
ret = -ENOMEM;
goto out;
}
- ret = set_page_extent_mapped(page);
- if (ret < 0)
- goto out_unlock;
if (!PageUptodate(page)) {
ret = btrfs_read_folio(NULL, page_folio(page));
@@ -4862,6 +4875,17 @@ again:
goto out_unlock;
}
}
+
+ /*
+ * We unlock the page after the io is completed and then re-lock it
+ * above. release_folio() could have come in between that and cleared
+ * PagePrivate(), but left the page in the mapping. Set the page mapped
+ * here to make sure it's properly set for the subpage stuff.
+ */
+ ret = set_page_extent_mapped(page);
+ if (ret < 0)
+ goto out_unlock;
+
wait_on_page_writeback(page);
lock_extent(io_tree, block_start, block_end, &cached_state);
@@ -5845,6 +5869,74 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
}
/*
+ * Find the highest existing sequence number in a directory and then set the
+ * in-memory index_cnt variable to the first free sequence number.
+ */
+static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_key key, found_key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ int ret;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = (u64)-1;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ /* FIXME: we should be able to handle this */
+ if (ret == 0)
+ goto out;
+ ret = 0;
+
+ if (path->slots[0] == 0) {
+ inode->index_cnt = BTRFS_DIR_START_INDEX;
+ goto out;
+ }
+
+ path->slots[0]--;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ if (found_key.objectid != btrfs_ino(inode) ||
+ found_key.type != BTRFS_DIR_INDEX_KEY) {
+ inode->index_cnt = BTRFS_DIR_START_INDEX;
+ goto out;
+ }
+
+ inode->index_cnt = found_key.offset + 1;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index)
+{
+ if (dir->index_cnt == (u64)-1) {
+ int ret;
+
+ ret = btrfs_inode_delayed_dir_index_count(dir);
+ if (ret) {
+ ret = btrfs_set_inode_index_count(dir);
+ if (ret)
+ return ret;
+ }
+ }
+
+ *index = dir->index_cnt;
+
+ return 0;
+}
+
+/*
* All this infrastructure exists because dir_emit can fault, and we are holding
* the tree lock when doing readdir. For now just allocate a buffer and copy
* our information into that, and then dir_emit from the buffer. This is
@@ -5856,10 +5948,17 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
static int btrfs_opendir(struct inode *inode, struct file *file)
{
struct btrfs_file_private *private;
+ u64 last_index;
+ int ret;
+
+ ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index);
+ if (ret)
+ return ret;
private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
if (!private)
return -ENOMEM;
+ private->last_index = last_index;
private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
if (!private->filldir_buf) {
kfree(private);
@@ -5926,7 +6025,8 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
INIT_LIST_HEAD(&ins_list);
INIT_LIST_HEAD(&del_list);
- put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list);
+ put = btrfs_readdir_get_delayed_items(inode, private->last_index,
+ &ins_list, &del_list);
again:
key.type = BTRFS_DIR_INDEX_KEY;
@@ -5944,6 +6044,8 @@ again:
break;
if (found_key.offset < ctx->pos)
continue;
+ if (found_key.offset > private->last_index)
+ break;
if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
continue;
di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
@@ -6072,57 +6174,6 @@ static int btrfs_update_time(struct inode *inode, int flags)
}
/*
- * find the highest existing sequence number in a directory
- * and then set the in-memory index_cnt variable to reflect
- * free sequence numbers
- */
-static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
-{
- struct btrfs_root *root = inode->root;
- struct btrfs_key key, found_key;
- struct btrfs_path *path;
- struct extent_buffer *leaf;
- int ret;
-
- key.objectid = btrfs_ino(inode);
- key.type = BTRFS_DIR_INDEX_KEY;
- key.offset = (u64)-1;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
- /* FIXME: we should be able to handle this */
- if (ret == 0)
- goto out;
- ret = 0;
-
- if (path->slots[0] == 0) {
- inode->index_cnt = BTRFS_DIR_START_INDEX;
- goto out;
- }
-
- path->slots[0]--;
-
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-
- if (found_key.objectid != btrfs_ino(inode) ||
- found_key.type != BTRFS_DIR_INDEX_KEY) {
- inode->index_cnt = BTRFS_DIR_START_INDEX;
- goto out;
- }
-
- inode->index_cnt = found_key.offset + 1;
-out:
- btrfs_free_path(path);
- return ret;
-}
-
-/*
* helper to find a free sequence number in a given directory. This current
* code is very simple, later versions will do smarter things in the btree
*/
@@ -7834,8 +7885,11 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
if (ret) {
- bbio->bio.bi_status = errno_to_blk_status(ret);
- btrfs_dio_end_io(bbio);
+ btrfs_finish_ordered_extent(dio_data->ordered, NULL,
+ file_offset, dip->bytes,
+ !ret);
+ bio->bi_status = errno_to_blk_status(ret);
+ iomap_dio_bio_end_io(bio);
return;
}
}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index da1f84a0eb29..2637d6b157ff 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -4445,4 +4445,5 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
ulist_free(entry->old_roots);
kfree(entry);
}
+ *root = RB_ROOT;
}
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index f37b925d587f..0249ea52bb80 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -71,7 +71,7 @@ static void rmw_rbio_work_locked(struct work_struct *work);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
-static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check);
+static int finish_parity_scrub(struct btrfs_raid_bio *rbio);
static void scrub_rbio_work_locked(struct work_struct *work);
static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
@@ -2404,7 +2404,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
return 0;
}
-static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check)
+static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
{
struct btrfs_io_context *bioc = rbio->bioc;
const u32 sectorsize = bioc->fs_info->sectorsize;
@@ -2445,9 +2445,6 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check)
*/
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
- if (!need_check)
- goto writeback;
-
p_sector.page = alloc_page(GFP_NOFS);
if (!p_sector.page)
return -ENOMEM;
@@ -2516,7 +2513,6 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check)
q_sector.page = NULL;
}
-writeback:
/*
* time to start writing. Make bios for everything from the
* higher layers (the bio_list in our rbio) and our p/q. Ignore
@@ -2699,7 +2695,6 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio)
static void scrub_rbio(struct btrfs_raid_bio *rbio)
{
- bool need_check = false;
int sector_nr;
int ret;
@@ -2722,7 +2717,7 @@ static void scrub_rbio(struct btrfs_raid_bio *rbio)
* We have every sector properly prepared. Can finish the scrub
* and writeback the good content.
*/
- ret = finish_parity_scrub(rbio, need_check);
+ ret = finish_parity_scrub(rbio);
wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
int found_errors;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 25a3361caedc..46c3c1d57266 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1916,7 +1916,39 @@ again:
err = PTR_ERR(root);
break;
}
- ASSERT(root->reloc_root == reloc_root);
+
+ if (unlikely(root->reloc_root != reloc_root)) {
+ if (root->reloc_root) {
+ btrfs_err(fs_info,
+"reloc tree mismatch, root %lld has reloc root key (%lld %u %llu) gen %llu, expect reloc root key (%lld %u %llu) gen %llu",
+ root->root_key.objectid,
+ root->reloc_root->root_key.objectid,
+ root->reloc_root->root_key.type,
+ root->reloc_root->root_key.offset,
+ btrfs_root_generation(
+ &root->reloc_root->root_item),
+ reloc_root->root_key.objectid,
+ reloc_root->root_key.type,
+ reloc_root->root_key.offset,
+ btrfs_root_generation(
+ &reloc_root->root_item));
+ } else {
+ btrfs_err(fs_info,
+"reloc tree mismatch, root %lld has no reloc root, expect reloc root key (%lld %u %llu) gen %llu",
+ root->root_key.objectid,
+ reloc_root->root_key.objectid,
+ reloc_root->root_key.type,
+ reloc_root->root_key.offset,
+ btrfs_root_generation(
+ &reloc_root->root_item));
+ }
+ list_add(&reloc_root->root_list, &reloc_roots);
+ btrfs_put_root(root);
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ if (!err)
+ err = -EUCLEAN;
+ break;
+ }
/*
* set reference count to 1, so btrfs_recover_relocation
@@ -1989,7 +2021,7 @@ again:
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
false);
if (btrfs_root_refs(&reloc_root->root_item) > 0) {
- if (IS_ERR(root)) {
+ if (WARN_ON(IS_ERR(root))) {
/*
* For recovery we read the fs roots on mount,
* and if we didn't find the root then we marked
@@ -1998,17 +2030,14 @@ again:
* memory. However there's no reason we can't
* handle the error properly here just in case.
*/
- ASSERT(0);
ret = PTR_ERR(root);
goto out;
}
- if (root->reloc_root != reloc_root) {
+ if (WARN_ON(root->reloc_root != reloc_root)) {
/*
- * This is actually impossible without something
- * going really wrong (like weird race condition
- * or cosmic rays).
+ * This can happen if on-disk metadata has some
+ * corruption, e.g. bad reloc tree key offset.
*/
- ASSERT(0);
ret = -EINVAL;
goto out;
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 4cae41bd6de0..7289f5bff397 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -605,7 +605,8 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
btrfs_stack_header_bytenr(header), logical);
return;
}
- if (memcmp(header->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE) != 0) {
+ if (memcmp(header->fsid, fs_info->fs_devices->metadata_uuid,
+ BTRFS_FSID_SIZE) != 0) {
bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f6e4016a343e..a10e38c2609f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -826,8 +826,13 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
trans = start_transaction(root, 0, TRANS_ATTACH,
BTRFS_RESERVE_NO_FLUSH, true);
- if (trans == ERR_PTR(-ENOENT))
- btrfs_wait_for_commit(root->fs_info, 0);
+ if (trans == ERR_PTR(-ENOENT)) {
+ int ret;
+
+ ret = btrfs_wait_for_commit(root->fs_info, 0);
+ if (ret)
+ return ERR_PTR(ret);
+ }
return trans;
}
@@ -931,6 +936,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
}
wait_for_commit(cur_trans, TRANS_STATE_COMPLETED);
+ ret = cur_trans->aborted;
btrfs_put_transaction(cur_trans);
out:
return ret;
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 038dfa8f1788..ab08a0b01311 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -446,6 +446,20 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key,
btrfs_item_key_to_cpu(leaf, &item_key, slot);
is_root_item = (item_key.type == BTRFS_ROOT_ITEM_KEY);
+ /*
+ * Bad rootid for reloc trees.
+ *
+ * Reloc trees are only for subvolume trees, other trees only need
+ * to be COWed to be relocated.
+ */
+ if (unlikely(is_root_item && key->objectid == BTRFS_TREE_RELOC_OBJECTID &&
+ !is_fstree(key->offset))) {
+ generic_err(leaf, slot,
+ "invalid reloc tree for root %lld, root id is not a subvolume tree",
+ key->offset);
+ return -EUCLEAN;
+ }
+
/* No such tree id */
if (unlikely(key->objectid == 0)) {
if (is_root_item)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 264c71590370..f6718999d183 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4076,14 +4076,6 @@ static int alloc_profile_is_valid(u64 flags, int extended)
return has_single_bit_set(flags);
}
-static inline int balance_need_close(struct btrfs_fs_info *fs_info)
-{
- /* cancel requested || normal exit path */
- return atomic_read(&fs_info->balance_cancel_req) ||
- (atomic_read(&fs_info->balance_pause_req) == 0 &&
- atomic_read(&fs_info->balance_cancel_req) == 0);
-}
-
/*
* Validate target profile against allowed profiles and return true if it's OK.
* Otherwise print the error message and return false.
@@ -4273,6 +4265,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
u64 num_devices;
unsigned seq;
bool reducing_redundancy;
+ bool paused = false;
int i;
if (btrfs_fs_closing(fs_info) ||
@@ -4403,6 +4396,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) {
btrfs_info(fs_info, "balance: paused");
btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
+ paused = true;
}
/*
* Balance can be canceled by:
@@ -4431,8 +4425,8 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
btrfs_update_ioctl_balance_args(fs_info, bargs);
}
- if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
- balance_need_close(fs_info)) {
+ /* We didn't pause, we can clean everything up. */
+ if (!paused) {
reset_balance_state(fs_info);
btrfs_exclop_finish(fs_info);
}
@@ -4642,8 +4636,7 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
}
}
- BUG_ON(fs_info->balance_ctl ||
- test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
+ ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
atomic_dec(&fs_info->balance_cancel_req);
mutex_unlock(&fs_info->balance_mutex);
return 0;
@@ -6402,7 +6395,8 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
(op == BTRFS_MAP_READ || !dev_replace_is_ongoing ||
!dev_replace->tgtdev)) {
set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr);
- *mirror_num_ret = mirror_num;
+ if (mirror_num_ret)
+ *mirror_num_ret = mirror_num;
*bioc_ret = NULL;
ret = 0;
goto out;
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 85b8b332add9..72b90bc19a19 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -805,6 +805,9 @@ int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
return -EINVAL;
}
+ btrfs_clear_and_info(info, DISCARD_ASYNC,
+ "zoned: async discard ignored and disabled for zoned mode");
+
return 0;
}
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 4a2b39d9a61a..bdcffb04513f 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -2019,9 +2019,10 @@ unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
}
}
+WRAP_DIR_ITER(ceph_readdir) // FIXME!
const struct file_operations ceph_dir_fops = {
.read = ceph_read_dir,
- .iterate = ceph_readdir,
+ .iterate_shared = shared_ceph_readdir,
.llseek = ceph_dir_llseek,
.open = ceph_open,
.release = ceph_release,
@@ -2033,7 +2034,7 @@ const struct file_operations ceph_dir_fops = {
};
const struct file_operations ceph_snapdir_fops = {
- .iterate = ceph_readdir,
+ .iterate_shared = shared_ceph_readdir,
.llseek = ceph_dir_llseek,
.open = ceph_open,
.release = ceph_release,
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 66048a86c480..5fb367b1d4b0 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -4764,7 +4764,7 @@ static void delayed_work(struct work_struct *work)
dout("mdsc delayed_work\n");
- if (mdsc->stopping)
+ if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHED)
return;
mutex_lock(&mdsc->mutex);
@@ -4943,7 +4943,7 @@ void send_flush_mdlog(struct ceph_mds_session *s)
void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
{
dout("pre_umount\n");
- mdsc->stopping = 1;
+ mdsc->stopping = CEPH_MDSC_STOPPING_BEGIN;
ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true);
ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 724307ff89cd..86d2965e68a1 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -380,6 +380,11 @@ struct cap_wait {
int want;
};
+enum {
+ CEPH_MDSC_STOPPING_BEGIN = 1,
+ CEPH_MDSC_STOPPING_FLUSHED = 2,
+};
+
/*
* mds client state
*/
diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c
index cce78d769f55..6d3584f16f9a 100644
--- a/fs/ceph/metric.c
+++ b/fs/ceph/metric.c
@@ -216,7 +216,7 @@ static void metric_delayed_work(struct work_struct *work)
struct ceph_mds_client *mdsc =
container_of(m, struct ceph_mds_client, metric);
- if (mdsc->stopping)
+ if (mdsc->stopping || disable_send_metrics)
return;
if (!m->session || !check_session_state(m->session)) {
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 3fc48b43cab0..a5f52013314d 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1374,6 +1374,16 @@ static void ceph_kill_sb(struct super_block *s)
ceph_mdsc_pre_umount(fsc->mdsc);
flush_fs_workqueues(fsc);
+ /*
+ * Though the kill_anon_super() will finally trigger the
+ * sync_filesystem() anyway, we still need to do it here
+ * and then bump the stage of shutdown to stop the work
+ * queue as earlier as possible.
+ */
+ sync_filesystem(s);
+
+ fsc->mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED;
+
kill_anon_super(s);
fsc->client->extra_mon_dispatch = NULL;
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 1d4f40048efc..cb512b10473b 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -429,21 +429,14 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
cfi = coda_ftoc(coda_file);
host_file = cfi->cfi_container;
- if (host_file->f_op->iterate || host_file->f_op->iterate_shared) {
+ if (host_file->f_op->iterate_shared) {
struct inode *host_inode = file_inode(host_file);
ret = -ENOENT;
if (!IS_DEADDIR(host_inode)) {
- if (host_file->f_op->iterate_shared) {
- inode_lock_shared(host_inode);
- ret = host_file->f_op->iterate_shared(host_file, ctx);
- file_accessed(host_file);
- inode_unlock_shared(host_inode);
- } else {
- inode_lock(host_inode);
- ret = host_file->f_op->iterate(host_file, ctx);
- file_accessed(host_file);
- inode_unlock(host_inode);
- }
+ inode_lock_shared(host_inode);
+ ret = host_file->f_op->iterate_shared(host_file, ctx);
+ file_accessed(host_file);
+ inode_unlock_shared(host_inode);
}
return ret;
}
@@ -585,10 +578,11 @@ const struct inode_operations coda_dir_inode_operations = {
.setattr = coda_setattr,
};
+WRAP_DIR_ITER(coda_readdir) // FIXME!
const struct file_operations coda_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .iterate = coda_readdir,
+ .iterate_shared = shared_coda_readdir,
.open = coda_open,
.release = coda_release,
.fsync = coda_fsync,
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 2a29943fa5cc..cfad1eac7fd9 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -148,7 +148,7 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx,
*maptype = 0;
return inpage;
}
- kunmap_atomic(inpage);
+ kunmap_local(inpage);
might_sleep();
src = erofs_vm_map_ram(rq->in, ctx->inpages);
if (!src)
@@ -162,7 +162,7 @@ docopy:
src = erofs_get_pcpubuf(ctx->inpages);
if (!src) {
DBG_BUGON(1);
- kunmap_atomic(inpage);
+ kunmap_local(inpage);
return ERR_PTR(-EFAULT);
}
@@ -173,9 +173,9 @@ docopy:
min_t(unsigned int, total, PAGE_SIZE - *inputmargin);
if (!inpage)
- inpage = kmap_atomic(*in);
+ inpage = kmap_local_page(*in);
memcpy(tmp, inpage + *inputmargin, page_copycnt);
- kunmap_atomic(inpage);
+ kunmap_local(inpage);
inpage = NULL;
tmp += page_copycnt;
total -= page_copycnt;
@@ -214,7 +214,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
int ret, maptype;
DBG_BUGON(*rq->in == NULL);
- headpage = kmap_atomic(*rq->in);
+ headpage = kmap_local_page(*rq->in);
/* LZ4 decompression inplace is only safe if zero_padding is enabled */
if (erofs_sb_has_zero_padding(EROFS_SB(rq->sb))) {
@@ -223,7 +223,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
min_t(unsigned int, rq->inputsize,
rq->sb->s_blocksize - rq->pageofs_in));
if (ret) {
- kunmap_atomic(headpage);
+ kunmap_local(headpage);
return ret;
}
may_inplace = !((rq->pageofs_in + rq->inputsize) &
@@ -261,7 +261,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
}
if (maptype == 0) {
- kunmap_atomic(headpage);
+ kunmap_local(headpage);
} else if (maptype == 1) {
vm_unmap_ram(src, ctx->inpages);
} else if (maptype == 2) {
@@ -289,7 +289,7 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq,
/* one optimized fast path only for non bigpcluster cases yet */
if (ctx.inpages == 1 && ctx.outpages == 1 && !rq->inplace_io) {
DBG_BUGON(!*rq->out);
- dst = kmap_atomic(*rq->out);
+ dst = kmap_local_page(*rq->out);
dst_maptype = 0;
goto dstmap_out;
}
@@ -311,7 +311,7 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq,
dstmap_out:
ret = z_erofs_lz4_decompress_mem(&ctx, dst + rq->pageofs_out);
if (!dst_maptype)
- kunmap_atomic(dst);
+ kunmap_local(dst);
else if (dst_maptype == 2)
vm_unmap_ram(dst, ctx.outpages);
return ret;
@@ -328,7 +328,7 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
const unsigned int lefthalf = rq->outputsize - righthalf;
const unsigned int interlaced_offset =
rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out;
- unsigned char *src, *dst;
+ u8 *src;
if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) {
DBG_BUGON(1);
@@ -341,22 +341,19 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
}
src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in;
- if (rq->out[0]) {
- dst = kmap_local_page(rq->out[0]);
- memcpy(dst + rq->pageofs_out, src + interlaced_offset,
- righthalf);
- kunmap_local(dst);
- }
+ if (rq->out[0])
+ memcpy_to_page(rq->out[0], rq->pageofs_out,
+ src + interlaced_offset, righthalf);
if (outpages > inpages) {
DBG_BUGON(!rq->out[outpages - 1]);
if (rq->out[outpages - 1] != rq->in[inpages - 1]) {
- dst = kmap_local_page(rq->out[outpages - 1]);
- memcpy(dst, interlaced_offset ? src :
- (src + righthalf), lefthalf);
- kunmap_local(dst);
+ memcpy_to_page(rq->out[outpages - 1], 0, src +
+ (interlaced_offset ? 0 : righthalf),
+ lefthalf);
} else if (!interlaced_offset) {
memmove(src, src + righthalf, lefthalf);
+ flush_dcache_page(rq->in[inpages - 1]);
}
}
kunmap_local(src);
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index f3053f0dd6e1..edc8ec7581b8 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -179,7 +179,8 @@ static void *erofs_read_inode(struct erofs_buf *buf,
inode->i_flags &= ~S_DAX;
if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
- vi->datalayout == EROFS_INODE_FLAT_PLAIN)
+ (vi->datalayout == EROFS_INODE_FLAT_PLAIN ||
+ vi->datalayout == EROFS_INODE_CHUNK_BASED))
inode->i_flags |= S_DAX;
if (!nblks)
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 9d6a3c6158bd..566f68ddfa36 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -889,8 +889,6 @@ static void erofs_kill_sb(struct super_block *sb)
{
struct erofs_sb_info *sbi;
- WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC);
-
/* pseudo mount for anon inodes */
if (sb->s_flags & SB_KERNMOUNT) {
kill_anon_super(sb);
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 5f1890e309c6..de4f12152b62 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -1035,7 +1035,7 @@ hitted:
*/
tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
- cur = end - min_t(unsigned int, offset + end - map->m_la, end);
+ cur = end - min_t(erofs_off_t, offset + end - map->m_la, end);
if (!(map->m_flags & EROFS_MAP_MAPPED)) {
zero_user_segment(page, cur, end);
goto next_part;
@@ -1144,10 +1144,11 @@ static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be,
struct z_erofs_bvec *bvec)
{
struct z_erofs_bvec_item *item;
+ unsigned int pgnr;
- if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK)) {
- unsigned int pgnr;
-
+ if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK) &&
+ (bvec->end == PAGE_SIZE ||
+ bvec->offset + bvec->end == be->pcl->length)) {
pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT;
DBG_BUGON(pgnr >= be->nr_pages);
if (!be->decompressed_pages[pgnr]) {
@@ -1841,7 +1842,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
}
cur = map->m_la + map->m_llen - 1;
- while (cur >= end) {
+ while ((cur >= end) && (cur < i_size_read(inode))) {
pgoff_t index = cur >> PAGE_SHIFT;
struct page *page;
diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c
index 9f42f25fab92..e918decb3735 100644
--- a/fs/exfat/balloc.c
+++ b/fs/exfat/balloc.c
@@ -69,7 +69,7 @@ static int exfat_allocate_bitmap(struct super_block *sb,
}
sbi->map_sectors = ((need_map_size - 1) >>
(sb->s_blocksize_bits)) + 1;
- sbi->vol_amap = kmalloc_array(sbi->map_sectors,
+ sbi->vol_amap = kvmalloc_array(sbi->map_sectors,
sizeof(struct buffer_head *), GFP_KERNEL);
if (!sbi->vol_amap)
return -ENOMEM;
@@ -84,7 +84,7 @@ static int exfat_allocate_bitmap(struct super_block *sb,
while (j < i)
brelse(sbi->vol_amap[j++]);
- kfree(sbi->vol_amap);
+ kvfree(sbi->vol_amap);
sbi->vol_amap = NULL;
return -EIO;
}
@@ -138,7 +138,7 @@ void exfat_free_bitmap(struct exfat_sb_info *sbi)
for (i = 0; i < sbi->map_sectors; i++)
__brelse(sbi->vol_amap[i]);
- kfree(sbi->vol_amap);
+ kvfree(sbi->vol_amap);
}
int exfat_set_bitmap(struct inode *inode, unsigned int clu, bool sync)
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index 957574180a5e..e1586bba6d86 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -34,6 +34,7 @@ static int exfat_get_uniname_from_ext_entry(struct super_block *sb,
{
int i, err;
struct exfat_entry_set_cache es;
+ unsigned int uni_len = 0, len;
err = exfat_get_dentry_set(&es, sb, p_dir, entry, ES_ALL_ENTRIES);
if (err)
@@ -52,7 +53,10 @@ static int exfat_get_uniname_from_ext_entry(struct super_block *sb,
if (exfat_get_entry_type(ep) != TYPE_EXTEND)
break;
- exfat_extract_uni_name(ep, uniname);
+ len = exfat_extract_uni_name(ep, uniname);
+ uni_len += len;
+ if (len != EXFAT_FILE_NAME_LEN || uni_len >= MAX_NAME_LENGTH)
+ break;
uniname += EXFAT_FILE_NAME_LEN;
}
@@ -214,7 +218,10 @@ static void exfat_free_namebuf(struct exfat_dentry_namebuf *nb)
exfat_init_namebuf(nb);
}
-/* skip iterating emit_dots when dir is empty */
+/*
+ * Before calling dir_emit*(), sbi->s_lock should be released
+ * because page fault can occur in dir_emit*().
+ */
#define ITER_POS_FILLED_DOTS (2)
static int exfat_iterate(struct file *file, struct dir_context *ctx)
{
@@ -229,11 +236,10 @@ static int exfat_iterate(struct file *file, struct dir_context *ctx)
int err = 0, fake_offset = 0;
exfat_init_namebuf(nb);
- mutex_lock(&EXFAT_SB(sb)->s_lock);
cpos = ctx->pos;
if (!dir_emit_dots(file, ctx))
- goto unlock;
+ goto out;
if (ctx->pos == ITER_POS_FILLED_DOTS) {
cpos = 0;
@@ -245,16 +251,18 @@ static int exfat_iterate(struct file *file, struct dir_context *ctx)
/* name buffer should be allocated before use */
err = exfat_alloc_namebuf(nb);
if (err)
- goto unlock;
+ goto out;
get_new:
+ mutex_lock(&EXFAT_SB(sb)->s_lock);
+
if (ei->flags == ALLOC_NO_FAT_CHAIN && cpos >= i_size_read(inode))
goto end_of_dir;
err = exfat_readdir(inode, &cpos, &de);
if (err) {
/*
- * At least we tried to read a sector. Move cpos to next sector
- * position (should be aligned).
+ * At least we tried to read a sector.
+ * Move cpos to next sector position (should be aligned).
*/
if (err == -EIO) {
cpos += 1 << (sb->s_blocksize_bits);
@@ -277,16 +285,10 @@ get_new:
inum = iunique(sb, EXFAT_ROOT_INO);
}
- /*
- * Before calling dir_emit(), sb_lock should be released.
- * Because page fault can occur in dir_emit() when the size
- * of buffer given from user is larger than one page size.
- */
mutex_unlock(&EXFAT_SB(sb)->s_lock);
if (!dir_emit(ctx, nb->lfn, strlen(nb->lfn), inum,
(de.attr & ATTR_SUBDIR) ? DT_DIR : DT_REG))
- goto out_unlocked;
- mutex_lock(&EXFAT_SB(sb)->s_lock);
+ goto out;
ctx->pos = cpos;
goto get_new;
@@ -294,9 +296,8 @@ end_of_dir:
if (!cpos && fake_offset)
cpos = ITER_POS_FILLED_DOTS;
ctx->pos = cpos;
-unlock:
mutex_unlock(&EXFAT_SB(sb)->s_lock);
-out_unlocked:
+out:
/*
* To improve performance, free namebuf after unlock sb_lock.
* If namebuf is not allocated, this function do nothing
@@ -305,10 +306,11 @@ out_unlocked:
return err;
}
+WRAP_DIR_ITER(exfat_iterate) // FIXME!
const struct file_operations exfat_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .iterate = exfat_iterate,
+ .iterate_shared = shared_exfat_iterate,
.unlocked_ioctl = exfat_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = exfat_compat_ioctl,
@@ -1079,7 +1081,8 @@ rewind:
if (entry_type == TYPE_EXTEND) {
unsigned short entry_uniname[16], unichar;
- if (step != DIRENT_STEP_NAME) {
+ if (step != DIRENT_STEP_NAME ||
+ name_len >= MAX_NAME_LENGTH) {
step = DIRENT_STEP_FILE;
continue;
}
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 40e624cf7e92..d1dbe47c7975 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -315,7 +315,7 @@ static int get_name(const struct path *path, char *name, struct dentry *child)
goto out;
error = -EINVAL;
- if (!file->f_op->iterate && !file->f_op->iterate_shared)
+ if (!file->f_op->iterate_shared)
goto out_close;
buffer.sequence = 0;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a2475b8c9fb5..21b903fe546e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1006,14 +1006,11 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
* fls() instead since we need to know the actual length while modifying
* goal length.
*/
- order = fls(ac->ac_g_ex.fe_len);
+ order = fls(ac->ac_g_ex.fe_len) - 1;
min_order = order - sbi->s_mb_best_avail_max_trim_order;
if (min_order < 0)
min_order = 0;
- if (1 << min_order < ac->ac_o_ex.fe_len)
- min_order = fls(ac->ac_o_ex.fe_len) + 1;
-
if (sbi->s_stripe > 0) {
/*
* We are assuming that stripe size is always a multiple of
@@ -1021,9 +1018,16 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
*/
num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe);
if (1 << min_order < num_stripe_clusters)
- min_order = fls(num_stripe_clusters);
+ /*
+ * We consider 1 order less because later we round
+ * up the goal len to num_stripe_clusters
+ */
+ min_order = fls(num_stripe_clusters) - 1;
}
+ if (1 << min_order < ac->ac_o_ex.fe_len)
+ min_order = fls(ac->ac_o_ex.fe_len);
+
for (i = order; i >= min_order; i--) {
int frag_order;
/*
@@ -4761,8 +4765,8 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
int order, i;
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
struct ext4_locality_group *lg;
- struct ext4_prealloc_space *tmp_pa, *cpa = NULL;
- ext4_lblk_t tmp_pa_start, tmp_pa_end;
+ struct ext4_prealloc_space *tmp_pa = NULL, *cpa = NULL;
+ loff_t tmp_pa_end;
struct rb_node *iter;
ext4_fsblk_t goal_block;
@@ -4770,47 +4774,151 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
return false;
- /* first, try per-file preallocation */
+ /*
+ * first, try per-file preallocation by searching the inode pa rbtree.
+ *
+ * Here, we can't do a direct traversal of the tree because
+ * ext4_mb_discard_group_preallocation() can paralelly mark the pa
+ * deleted and that can cause direct traversal to skip some entries.
+ */
read_lock(&ei->i_prealloc_lock);
+
+ if (RB_EMPTY_ROOT(&ei->i_prealloc_node)) {
+ goto try_group_pa;
+ }
+
+ /*
+ * Step 1: Find a pa with logical start immediately adjacent to the
+ * original logical start. This could be on the left or right.
+ *
+ * (tmp_pa->pa_lstart never changes so we can skip locking for it).
+ */
for (iter = ei->i_prealloc_node.rb_node; iter;
iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical,
- tmp_pa_start, iter)) {
+ tmp_pa->pa_lstart, iter)) {
tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
pa_node.inode_node);
+ }
- /* all fields in this condition don't change,
- * so we can skip locking for them */
- tmp_pa_start = tmp_pa->pa_lstart;
- tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
-
- /* original request start doesn't lie in this PA */
- if (ac->ac_o_ex.fe_logical < tmp_pa_start ||
- ac->ac_o_ex.fe_logical >= tmp_pa_end)
- continue;
+ /*
+ * Step 2: The adjacent pa might be to the right of logical start, find
+ * the left adjacent pa. After this step we'd have a valid tmp_pa whose
+ * logical start is towards the left of original request's logical start
+ */
+ if (tmp_pa->pa_lstart > ac->ac_o_ex.fe_logical) {
+ struct rb_node *tmp;
+ tmp = rb_prev(&tmp_pa->pa_node.inode_node);
- /* non-extent files can't have physical blocks past 2^32 */
- if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
- (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) >
- EXT4_MAX_BLOCK_FILE_PHYS)) {
+ if (tmp) {
+ tmp_pa = rb_entry(tmp, struct ext4_prealloc_space,
+ pa_node.inode_node);
+ } else {
/*
- * Since PAs don't overlap, we won't find any
- * other PA to satisfy this.
+ * If there is no adjacent pa to the left then finding
+ * an overlapping pa is not possible hence stop searching
+ * inode pa tree
*/
- break;
+ goto try_group_pa;
}
+ }
+
+ BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical));
- /* found preallocated blocks, use them */
+ /*
+ * Step 3: If the left adjacent pa is deleted, keep moving left to find
+ * the first non deleted adjacent pa. After this step we should have a
+ * valid tmp_pa which is guaranteed to be non deleted.
+ */
+ for (iter = &tmp_pa->pa_node.inode_node;; iter = rb_prev(iter)) {
+ if (!iter) {
+ /*
+ * no non deleted left adjacent pa, so stop searching
+ * inode pa tree
+ */
+ goto try_group_pa;
+ }
+ tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
+ pa_node.inode_node);
spin_lock(&tmp_pa->pa_lock);
- if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free &&
- likely(ext4_mb_pa_goal_check(ac, tmp_pa))) {
- atomic_inc(&tmp_pa->pa_count);
- ext4_mb_use_inode_pa(ac, tmp_pa);
+ if (tmp_pa->pa_deleted == 0) {
+ /*
+ * We will keep holding the pa_lock from
+ * this point on because we don't want group discard
+ * to delete this pa underneath us. Since group
+ * discard is anyways an ENOSPC operation it
+ * should be okay for it to wait a few more cycles.
+ */
+ break;
+ } else {
spin_unlock(&tmp_pa->pa_lock);
- read_unlock(&ei->i_prealloc_lock);
- return true;
}
+ }
+
+ BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical));
+ BUG_ON(tmp_pa->pa_deleted == 1);
+
+ /*
+ * Step 4: We now have the non deleted left adjacent pa. Only this
+ * pa can possibly satisfy the request hence check if it overlaps
+ * original logical start and stop searching if it doesn't.
+ */
+ tmp_pa_end = (loff_t)tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
+
+ if (ac->ac_o_ex.fe_logical >= tmp_pa_end) {
spin_unlock(&tmp_pa->pa_lock);
+ goto try_group_pa;
+ }
+
+ /* non-extent files can't have physical blocks past 2^32 */
+ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
+ (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) >
+ EXT4_MAX_BLOCK_FILE_PHYS)) {
+ /*
+ * Since PAs don't overlap, we won't find any other PA to
+ * satisfy this.
+ */
+ spin_unlock(&tmp_pa->pa_lock);
+ goto try_group_pa;
+ }
+
+ if (tmp_pa->pa_free && likely(ext4_mb_pa_goal_check(ac, tmp_pa))) {
+ atomic_inc(&tmp_pa->pa_count);
+ ext4_mb_use_inode_pa(ac, tmp_pa);
+ spin_unlock(&tmp_pa->pa_lock);
+ read_unlock(&ei->i_prealloc_lock);
+ return true;
+ } else {
+ /*
+ * We found a valid overlapping pa but couldn't use it because
+ * it had no free blocks. This should ideally never happen
+ * because:
+ *
+ * 1. When a new inode pa is added to rbtree it must have
+ * pa_free > 0 since otherwise we won't actually need
+ * preallocation.
+ *
+ * 2. An inode pa that is in the rbtree can only have it's
+ * pa_free become zero when another thread calls:
+ * ext4_mb_new_blocks
+ * ext4_mb_use_preallocated
+ * ext4_mb_use_inode_pa
+ *
+ * 3. Further, after the above calls make pa_free == 0, we will
+ * immediately remove it from the rbtree in:
+ * ext4_mb_new_blocks
+ * ext4_mb_release_context
+ * ext4_mb_put_pa
+ *
+ * 4. Since the pa_free becoming 0 and pa_free getting removed
+ * from tree both happen in ext4_mb_new_blocks, which is always
+ * called with i_data_sem held for data allocations, we can be
+ * sure that another process will never see a pa in rbtree with
+ * pa_free == 0.
+ */
+ WARN_ON_ONCE(tmp_pa->pa_free == 0);
}
+ spin_unlock(&tmp_pa->pa_lock);
+try_group_pa:
read_unlock(&ei->i_prealloc_lock);
/* can we use group allocation? */
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index dbe54cddda3d..281e1bfbbe3e 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1782,6 +1782,20 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
memmove(here, (void *)here + size,
(void *)last - (void *)here + sizeof(__u32));
memset(last, 0, size);
+
+ /*
+ * Update i_inline_off - moved ibody region might contain
+ * system.data attribute. Handling a failure here won't
+ * cause other complications for setting an xattr.
+ */
+ if (!is_block && ext4_has_inline_data(inode)) {
+ ret = ext4_find_inline_data_nolock(inode);
+ if (ret) {
+ ext4_warning_inode(inode,
+ "unable to update i_inline_off");
+ goto out;
+ }
+ }
} else if (s->not_found) {
/* Insert new name. */
size_t size = EXT4_XATTR_LEN(name_len);
diff --git a/fs/file.c b/fs/file.c
index 7893ea161d77..3fd003a8604f 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -1036,16 +1036,30 @@ unsigned long __fdget_raw(unsigned int fd)
return __fget_light(fd, 0);
}
+/*
+ * Try to avoid f_pos locking. We only need it if the
+ * file is marked for FMODE_ATOMIC_POS, and it can be
+ * accessed multiple ways.
+ *
+ * Always do it for directories, because pidfd_getfd()
+ * can make a file accessible even if it otherwise would
+ * not be, and for directories this is a correctness
+ * issue, not a "POSIX requirement".
+ */
+static inline bool file_needs_f_pos_lock(struct file *file)
+{
+ return (file->f_mode & FMODE_ATOMIC_POS) &&
+ (file_count(file) > 1 || file->f_op->iterate_shared);
+}
+
unsigned long __fdget_pos(unsigned int fd)
{
unsigned long v = __fdget(fd);
struct file *file = (struct file *)(v & ~3);
- if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
- if (file_count(file) > 1) {
- v |= FDPUT_POS_UNLOCK;
- mutex_lock(&file->f_pos_lock);
- }
+ if (file && file_needs_f_pos_lock(file)) {
+ v |= FDPUT_POS_UNLOCK;
+ mutex_lock(&file->f_pos_lock);
}
return v;
}
diff --git a/fs/fs_context.c b/fs/fs_context.c
index 851214d1d013..30d82d2979af 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -692,6 +692,7 @@ void vfs_clean_context(struct fs_context *fc)
security_free_mnt_opts(&fc->security);
kfree(fc->source);
fc->source = NULL;
+ fc->exclusive = false;
fc->purpose = FS_CONTEXT_FOR_RECONFIGURE;
fc->phase = FS_CONTEXT_AWAITING_RECONF;
diff --git a/fs/fsopen.c b/fs/fsopen.c
index fc9d2d9fd234..ce03f6521c88 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -209,6 +209,72 @@ err:
return ret;
}
+static int vfs_cmd_create(struct fs_context *fc, bool exclusive)
+{
+ struct super_block *sb;
+ int ret;
+
+ if (fc->phase != FS_CONTEXT_CREATE_PARAMS)
+ return -EBUSY;
+
+ if (!mount_capable(fc))
+ return -EPERM;
+
+ /* require the new mount api */
+ if (exclusive && fc->ops == &legacy_fs_context_ops)
+ return -EOPNOTSUPP;
+
+ fc->phase = FS_CONTEXT_CREATING;
+ fc->exclusive = exclusive;
+
+ ret = vfs_get_tree(fc);
+ if (ret) {
+ fc->phase = FS_CONTEXT_FAILED;
+ return ret;
+ }
+
+ sb = fc->root->d_sb;
+ ret = security_sb_kern_mount(sb);
+ if (unlikely(ret)) {
+ fc_drop_locked(fc);
+ fc->phase = FS_CONTEXT_FAILED;
+ return ret;
+ }
+
+ /* vfs_get_tree() callchains will have grabbed @s_umount */
+ up_write(&sb->s_umount);
+ fc->phase = FS_CONTEXT_AWAITING_MOUNT;
+ return 0;
+}
+
+static int vfs_cmd_reconfigure(struct fs_context *fc)
+{
+ struct super_block *sb;
+ int ret;
+
+ if (fc->phase != FS_CONTEXT_RECONF_PARAMS)
+ return -EBUSY;
+
+ fc->phase = FS_CONTEXT_RECONFIGURING;
+
+ sb = fc->root->d_sb;
+ if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
+ fc->phase = FS_CONTEXT_FAILED;
+ return -EPERM;
+ }
+
+ down_write(&sb->s_umount);
+ ret = reconfigure_super(fc);
+ up_write(&sb->s_umount);
+ if (ret) {
+ fc->phase = FS_CONTEXT_FAILED;
+ return ret;
+ }
+
+ vfs_clean_context(fc);
+ return 0;
+}
+
/*
* Check the state and apply the configuration. Note that this function is
* allowed to 'steal' the value by setting param->xxx to NULL before returning.
@@ -216,7 +282,6 @@ err:
static int vfs_fsconfig_locked(struct fs_context *fc, int cmd,
struct fs_parameter *param)
{
- struct super_block *sb;
int ret;
ret = finish_clean_context(fc);
@@ -224,39 +289,11 @@ static int vfs_fsconfig_locked(struct fs_context *fc, int cmd,
return ret;
switch (cmd) {
case FSCONFIG_CMD_CREATE:
- if (fc->phase != FS_CONTEXT_CREATE_PARAMS)
- return -EBUSY;
- if (!mount_capable(fc))
- return -EPERM;
- fc->phase = FS_CONTEXT_CREATING;
- ret = vfs_get_tree(fc);
- if (ret)
- break;
- sb = fc->root->d_sb;
- ret = security_sb_kern_mount(sb);
- if (unlikely(ret)) {
- fc_drop_locked(fc);
- break;
- }
- up_write(&sb->s_umount);
- fc->phase = FS_CONTEXT_AWAITING_MOUNT;
- return 0;
+ return vfs_cmd_create(fc, false);
+ case FSCONFIG_CMD_CREATE_EXCL:
+ return vfs_cmd_create(fc, true);
case FSCONFIG_CMD_RECONFIGURE:
- if (fc->phase != FS_CONTEXT_RECONF_PARAMS)
- return -EBUSY;
- fc->phase = FS_CONTEXT_RECONFIGURING;
- sb = fc->root->d_sb;
- if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
- ret = -EPERM;
- break;
- }
- down_write(&sb->s_umount);
- ret = reconfigure_super(fc);
- up_write(&sb->s_umount);
- if (ret)
- break;
- vfs_clean_context(fc);
- return 0;
+ return vfs_cmd_reconfigure(fc);
default:
if (fc->phase != FS_CONTEXT_CREATE_PARAMS &&
fc->phase != FS_CONTEXT_RECONF_PARAMS)
@@ -264,8 +301,6 @@ static int vfs_fsconfig_locked(struct fs_context *fc, int cmd,
return vfs_parse_fs_param(fc, param);
}
- fc->phase = FS_CONTEXT_FAILED;
- return ret;
}
/**
@@ -353,6 +388,7 @@ SYSCALL_DEFINE5(fsconfig,
return -EINVAL;
break;
case FSCONFIG_CMD_CREATE:
+ case FSCONFIG_CMD_CREATE_EXCL:
case FSCONFIG_CMD_RECONFIGURE:
if (_key || _value || aux)
return -EINVAL;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 645fae48dc6b..881524b9a55a 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -258,7 +258,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
spin_unlock(&fi->lock);
}
kfree(forget);
- if (ret == -ENOMEM)
+ if (ret == -ENOMEM || ret == -EINTR)
goto out;
if (ret || fuse_invalid_attr(&outarg.attr) ||
fuse_stale_inode(inode, outarg.generation, &outarg.attr))
@@ -395,8 +395,6 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
goto out_put_forget;
err = -EIO;
- if (!outarg->nodeid)
- goto out_put_forget;
if (fuse_invalid_attr(&outarg->attr))
goto out_put_forget;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 0b435b23a97f..549358ffea8b 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1132,7 +1132,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
process_init_limits(fc, arg);
if (arg->minor >= 6) {
- u64 flags = arg->flags | (u64) arg->flags2 << 32;
+ u64 flags = arg->flags;
+
+ if (flags & FUSE_INIT_EXT)
+ flags |= (u64) arg->flags2 << 32;
ra_pages = arg->max_readahead / PAGE_SIZE;
if (flags & FUSE_ASYNC_READ)
@@ -1252,7 +1255,8 @@ void fuse_send_init(struct fuse_mount *fm)
FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS |
FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA |
FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT |
- FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP;
+ FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP |
+ FUSE_HAS_EXPIRE_ONLY;
#ifdef CONFIG_FUSE_DAX
if (fm->fc->dax)
flags |= FUSE_MAP_ALIGNMENT;
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
index 8e01bfdfc430..726640fa439e 100644
--- a/fs/fuse/ioctl.c
+++ b/fs/fuse/ioctl.c
@@ -9,14 +9,23 @@
#include <linux/compat.h>
#include <linux/fileattr.h>
-static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args)
+static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args,
+ struct fuse_ioctl_out *outarg)
{
- ssize_t ret = fuse_simple_request(fm, args);
+ ssize_t ret;
+
+ args->out_args[0].size = sizeof(*outarg);
+ args->out_args[0].value = outarg;
+
+ ret = fuse_simple_request(fm, args);
/* Translate ENOSYS, which shouldn't be returned from fs */
if (ret == -ENOSYS)
ret = -ENOTTY;
+ if (ret >= 0 && outarg->result == -ENOSYS)
+ outarg->result = -ENOTTY;
+
return ret;
}
@@ -264,13 +273,11 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
}
ap.args.out_numargs = 2;
- ap.args.out_args[0].size = sizeof(outarg);
- ap.args.out_args[0].value = &outarg;
ap.args.out_args[1].size = out_size;
ap.args.out_pages = true;
ap.args.out_argvar = true;
- transferred = fuse_send_ioctl(fm, &ap.args);
+ transferred = fuse_send_ioctl(fm, &ap.args, &outarg);
err = transferred;
if (transferred < 0)
goto out;
@@ -399,12 +406,10 @@ static int fuse_priv_ioctl(struct inode *inode, struct fuse_file *ff,
args.in_args[1].size = inarg.in_size;
args.in_args[1].value = ptr;
args.out_numargs = 2;
- args.out_args[0].size = sizeof(outarg);
- args.out_args[0].value = &outarg;
args.out_args[1].size = inarg.out_size;
args.out_args[1].value = ptr;
- err = fuse_send_ioctl(fm, &args);
+ err = fuse_send_ioctl(fm, &args, &outarg);
if (!err) {
if (outarg.result < 0)
err = outarg.result;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index ecbfbc6df621..766186c80682 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1578,7 +1578,7 @@ const struct file_operations gfs2_file_fops = {
.fsync = gfs2_fsync,
.lock = gfs2_lock,
.flock = gfs2_flock,
- .splice_read = filemap_splice_read,
+ .splice_read = copy_splice_read,
.splice_write = gfs2_file_splice_write,
.setlease = simple_nosetlease,
.fallocate = gfs2_fallocate,
@@ -1609,7 +1609,7 @@ const struct file_operations gfs2_file_fops_nolock = {
.open = gfs2_open,
.release = gfs2_release,
.fsync = gfs2_fsync,
- .splice_read = filemap_splice_read,
+ .splice_read = copy_splice_read,
.splice_write = gfs2_file_splice_write,
.setlease = generic_setlease,
.fallocate = gfs2_fallocate,
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index ec1631257978..7e835be7032d 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -230,9 +230,11 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+ struct super_block *sb = sdp->sd_vfs;
struct gfs2_bufdata *bd;
struct gfs2_meta_header *mh;
struct gfs2_trans *tr = current->journal_info;
+ bool withdraw = false;
lock_buffer(bh);
if (buffer_pinned(bh)) {
@@ -266,13 +268,15 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
(unsigned long long)bd->bd_bh->b_blocknr);
BUG();
}
- if (unlikely(test_bit(SDF_FROZEN, &sdp->sd_flags))) {
- fs_info(sdp, "GFS2:adding buf while frozen\n");
- gfs2_assert_withdraw(sdp, 0);
- }
if (unlikely(gfs2_withdrawn(sdp))) {
fs_info(sdp, "GFS2:adding buf while withdrawn! 0x%llx\n",
(unsigned long long)bd->bd_bh->b_blocknr);
+ goto out_unlock;
+ }
+ if (unlikely(sb->s_writers.frozen == SB_FREEZE_COMPLETE)) {
+ fs_info(sdp, "GFS2:adding buf while frozen\n");
+ withdraw = true;
+ goto out_unlock;
}
gfs2_pin(sdp, bd->bd_bh);
mh->__pad0 = cpu_to_be64(0);
@@ -281,6 +285,8 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
tr->tr_num_buf_new++;
out_unlock:
gfs2_log_unlock(sdp);
+ if (withdraw)
+ gfs2_assert_withdraw(sdp, 0);
out:
unlock_buffer(bh);
}
diff --git a/fs/inode.c b/fs/inode.c
index f55957ac80e6..0ad4fb4b8fe9 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -16,7 +16,6 @@
#include <linux/fsnotify.h>
#include <linux/mount.h>
#include <linux/posix_acl.h>
-#include <linux/prefetch.h>
#include <linux/buffer_head.h> /* for inode_has_buffers */
#include <linux/ratelimit.h>
#include <linux/list_lru.h>
@@ -1041,8 +1040,6 @@ struct inode *new_inode(struct super_block *sb)
{
struct inode *inode;
- spin_lock_prefetch(&sb->s_inode_list_lock);
-
inode = new_inode_pseudo(sb);
if (inode)
inode_sb_list_add(inode);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index adb92cdb24b0..aa8967cca1a3 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -872,10 +872,10 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
while ((ret = iomap_iter(&iter, ops)) > 0)
iter.processed = iomap_write_iter(&iter, i);
- if (unlikely(ret < 0))
+ if (unlikely(iter.pos == iocb->ki_pos))
return ret;
ret = iter.pos - iocb->ki_pos;
- iocb->ki_pos += ret;
+ iocb->ki_pos = iter.pos;
return ret;
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 51bd38da21cd..9ec91017a7f3 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -27,7 +27,7 @@
*
* Called with j_list_lock held.
*/
-static inline void __buffer_unlink_first(struct journal_head *jh)
+static inline void __buffer_unlink(struct journal_head *jh)
{
transaction_t *transaction = jh->b_cp_transaction;
@@ -41,45 +41,6 @@ static inline void __buffer_unlink_first(struct journal_head *jh)
}
/*
- * Unlink a buffer from a transaction checkpoint(io) list.
- *
- * Called with j_list_lock held.
- */
-static inline void __buffer_unlink(struct journal_head *jh)
-{
- transaction_t *transaction = jh->b_cp_transaction;
-
- __buffer_unlink_first(jh);
- if (transaction->t_checkpoint_io_list == jh) {
- transaction->t_checkpoint_io_list = jh->b_cpnext;
- if (transaction->t_checkpoint_io_list == jh)
- transaction->t_checkpoint_io_list = NULL;
- }
-}
-
-/*
- * Move a buffer from the checkpoint list to the checkpoint io list
- *
- * Called with j_list_lock held
- */
-static inline void __buffer_relink_io(struct journal_head *jh)
-{
- transaction_t *transaction = jh->b_cp_transaction;
-
- __buffer_unlink_first(jh);
-
- if (!transaction->t_checkpoint_io_list) {
- jh->b_cpnext = jh->b_cpprev = jh;
- } else {
- jh->b_cpnext = transaction->t_checkpoint_io_list;
- jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
- jh->b_cpprev->b_cpnext = jh;
- jh->b_cpnext->b_cpprev = jh;
- }
- transaction->t_checkpoint_io_list = jh;
-}
-
-/*
* Check a checkpoint buffer could be release or not.
*
* Requires j_list_lock
@@ -183,6 +144,7 @@ __flush_batch(journal_t *journal, int *batch_count)
struct buffer_head *bh = journal->j_chkpt_bhs[i];
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
+ journal->j_chkpt_bhs[i] = NULL;
}
*batch_count = 0;
}
@@ -242,15 +204,6 @@ restart:
jh = transaction->t_checkpoint_list;
bh = jh2bh(jh);
- if (buffer_locked(bh)) {
- get_bh(bh);
- spin_unlock(&journal->j_list_lock);
- wait_on_buffer(bh);
- /* the journal_head may have gone by now */
- BUFFER_TRACE(bh, "brelse");
- __brelse(bh);
- goto retry;
- }
if (jh->b_transaction != NULL) {
transaction_t *t = jh->b_transaction;
tid_t tid = t->t_tid;
@@ -285,30 +238,50 @@ restart:
spin_lock(&journal->j_list_lock);
goto restart;
}
- if (!buffer_dirty(bh)) {
+ if (!trylock_buffer(bh)) {
+ /*
+ * The buffer is locked, it may be writing back, or
+ * flushing out in the last couple of cycles, or
+ * re-adding into a new transaction, need to check
+ * it again until it's unlocked.
+ */
+ get_bh(bh);
+ spin_unlock(&journal->j_list_lock);
+ wait_on_buffer(bh);
+ /* the journal_head may have gone by now */
+ BUFFER_TRACE(bh, "brelse");
+ __brelse(bh);
+ goto retry;
+ } else if (!buffer_dirty(bh)) {
+ unlock_buffer(bh);
BUFFER_TRACE(bh, "remove from checkpoint");
- if (__jbd2_journal_remove_checkpoint(jh))
- /* The transaction was released; we're done */
+ /*
+ * If the transaction was released or the checkpoint
+ * list was empty, we're done.
+ */
+ if (__jbd2_journal_remove_checkpoint(jh) ||
+ !transaction->t_checkpoint_list)
goto out;
- continue;
+ } else {
+ unlock_buffer(bh);
+ /*
+ * We are about to write the buffer, it could be
+ * raced by some other transaction shrink or buffer
+ * re-log logic once we release the j_list_lock,
+ * leave it on the checkpoint list and check status
+ * again to make sure it's clean.
+ */
+ BUFFER_TRACE(bh, "queue");
+ get_bh(bh);
+ J_ASSERT_BH(bh, !buffer_jwrite(bh));
+ journal->j_chkpt_bhs[batch_count++] = bh;
+ transaction->t_chp_stats.cs_written++;
+ transaction->t_checkpoint_list = jh->b_cpnext;
}
- /*
- * Important: we are about to write the buffer, and
- * possibly block, while still holding the journal
- * lock. We cannot afford to let the transaction
- * logic start messing around with this buffer before
- * we write it to disk, as that would break
- * recoverability.
- */
- BUFFER_TRACE(bh, "queue");
- get_bh(bh);
- J_ASSERT_BH(bh, !buffer_jwrite(bh));
- journal->j_chkpt_bhs[batch_count++] = bh;
- __buffer_relink_io(jh);
- transaction->t_chp_stats.cs_written++;
+
if ((batch_count == JBD2_NR_BATCH) ||
- need_resched() ||
- spin_needbreak(&journal->j_list_lock))
+ need_resched() || spin_needbreak(&journal->j_list_lock) ||
+ jh2bh(transaction->t_checkpoint_list) == journal->j_chkpt_bhs[0])
goto unlock_and_flush;
}
@@ -322,38 +295,6 @@ restart:
goto restart;
}
- /*
- * Now we issued all of the transaction's buffers, let's deal
- * with the buffers that are out for I/O.
- */
-restart2:
- /* Did somebody clean up the transaction in the meanwhile? */
- if (journal->j_checkpoint_transactions != transaction ||
- transaction->t_tid != this_tid)
- goto out;
-
- while (transaction->t_checkpoint_io_list) {
- jh = transaction->t_checkpoint_io_list;
- bh = jh2bh(jh);
- if (buffer_locked(bh)) {
- get_bh(bh);
- spin_unlock(&journal->j_list_lock);
- wait_on_buffer(bh);
- /* the journal_head may have gone by now */
- BUFFER_TRACE(bh, "brelse");
- __brelse(bh);
- spin_lock(&journal->j_list_lock);
- goto restart2;
- }
-
- /*
- * Now in whatever state the buffer currently is, we
- * know that it has been written out and so we can
- * drop it from the list
- */
- if (__jbd2_journal_remove_checkpoint(jh))
- break;
- }
out:
spin_unlock(&journal->j_list_lock);
result = jbd2_cleanup_journal_tail(journal);
@@ -409,49 +350,9 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
/* Checkpoint list management */
/*
- * journal_clean_one_cp_list
- *
- * Find all the written-back checkpoint buffers in the given list and
- * release them. If 'destroy' is set, clean all buffers unconditionally.
- *
- * Called with j_list_lock held.
- * Returns 1 if we freed the transaction, 0 otherwise.
- */
-static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy)
-{
- struct journal_head *last_jh;
- struct journal_head *next_jh = jh;
-
- if (!jh)
- return 0;
-
- last_jh = jh->b_cpprev;
- do {
- jh = next_jh;
- next_jh = jh->b_cpnext;
-
- if (!destroy && __cp_buffer_busy(jh))
- return 0;
-
- if (__jbd2_journal_remove_checkpoint(jh))
- return 1;
- /*
- * This function only frees up some memory
- * if possible so we dont have an obligation
- * to finish processing. Bail out if preemption
- * requested:
- */
- if (need_resched())
- return 0;
- } while (jh != last_jh);
-
- return 0;
-}
-
-/*
* journal_shrink_one_cp_list
*
- * Find 'nr_to_scan' written-back checkpoint buffers in the given list
+ * Find all the written-back checkpoint buffers in the given list
* and try to release them. If the whole transaction is released, set
* the 'released' parameter. Return the number of released checkpointed
* buffers.
@@ -459,15 +360,15 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy)
* Called with j_list_lock held.
*/
static unsigned long journal_shrink_one_cp_list(struct journal_head *jh,
- unsigned long *nr_to_scan,
- bool *released)
+ bool destroy, bool *released)
{
struct journal_head *last_jh;
struct journal_head *next_jh = jh;
unsigned long nr_freed = 0;
int ret;
- if (!jh || *nr_to_scan == 0)
+ *released = false;
+ if (!jh)
return 0;
last_jh = jh->b_cpprev;
@@ -475,12 +376,15 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh,
jh = next_jh;
next_jh = jh->b_cpnext;
- (*nr_to_scan)--;
- if (__cp_buffer_busy(jh))
- continue;
+ if (destroy) {
+ ret = __jbd2_journal_remove_checkpoint(jh);
+ } else {
+ ret = jbd2_journal_try_remove_checkpoint(jh);
+ if (ret < 0)
+ continue;
+ }
nr_freed++;
- ret = __jbd2_journal_remove_checkpoint(jh);
if (ret) {
*released = true;
break;
@@ -488,7 +392,7 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh,
if (need_resched())
break;
- } while (jh != last_jh && *nr_to_scan);
+ } while (jh != last_jh);
return nr_freed;
}
@@ -506,11 +410,11 @@ unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal,
unsigned long *nr_to_scan)
{
transaction_t *transaction, *last_transaction, *next_transaction;
- bool released;
+ bool __maybe_unused released;
tid_t first_tid = 0, last_tid = 0, next_tid = 0;
tid_t tid = 0;
unsigned long nr_freed = 0;
- unsigned long nr_scanned = *nr_to_scan;
+ unsigned long freed;
again:
spin_lock(&journal->j_list_lock);
@@ -539,19 +443,11 @@ again:
transaction = next_transaction;
next_transaction = transaction->t_cpnext;
tid = transaction->t_tid;
- released = false;
-
- nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_list,
- nr_to_scan, &released);
- if (*nr_to_scan == 0)
- break;
- if (need_resched() || spin_needbreak(&journal->j_list_lock))
- break;
- if (released)
- continue;
- nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_io_list,
- nr_to_scan, &released);
+ freed = journal_shrink_one_cp_list(transaction->t_checkpoint_list,
+ false, &released);
+ nr_freed += freed;
+ (*nr_to_scan) -= min(*nr_to_scan, freed);
if (*nr_to_scan == 0)
break;
if (need_resched() || spin_needbreak(&journal->j_list_lock))
@@ -572,9 +468,8 @@ again:
if (*nr_to_scan && next_tid)
goto again;
out:
- nr_scanned -= *nr_to_scan;
trace_jbd2_shrink_checkpoint_list(journal, first_tid, tid, last_tid,
- nr_freed, nr_scanned, next_tid);
+ nr_freed, next_tid);
return nr_freed;
}
@@ -590,7 +485,7 @@ out:
void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy)
{
transaction_t *transaction, *last_transaction, *next_transaction;
- int ret;
+ bool released;
transaction = journal->j_checkpoint_transactions;
if (!transaction)
@@ -601,8 +496,8 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy)
do {
transaction = next_transaction;
next_transaction = transaction->t_cpnext;
- ret = journal_clean_one_cp_list(transaction->t_checkpoint_list,
- destroy);
+ journal_shrink_one_cp_list(transaction->t_checkpoint_list,
+ destroy, &released);
/*
* This function only frees up some memory if possible so we
* dont have an obligation to finish processing. Bail out if
@@ -610,23 +505,12 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy)
*/
if (need_resched())
return;
- if (ret)
- continue;
- /*
- * It is essential that we are as careful as in the case of
- * t_checkpoint_list with removing the buffer from the list as
- * we can possibly see not yet submitted buffers on io_list
- */
- ret = journal_clean_one_cp_list(transaction->
- t_checkpoint_io_list, destroy);
- if (need_resched())
- return;
/*
* Stop scanning if we couldn't free the transaction. This
* avoids pointless scanning of transactions which still
* weren't checkpointed.
*/
- if (!ret)
+ if (!released)
return;
} while (transaction != last_transaction);
}
@@ -705,7 +589,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
jbd2_journal_put_journal_head(jh);
/* Is this transaction empty? */
- if (transaction->t_checkpoint_list || transaction->t_checkpoint_io_list)
+ if (transaction->t_checkpoint_list)
return 0;
/*
@@ -737,6 +621,34 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
}
/*
+ * Check the checkpoint buffer and try to remove it from the checkpoint
+ * list if it's clean. Returns -EBUSY if it is not clean, returns 1 if
+ * it frees the transaction, 0 otherwise.
+ *
+ * This function is called with j_list_lock held.
+ */
+int jbd2_journal_try_remove_checkpoint(struct journal_head *jh)
+{
+ struct buffer_head *bh = jh2bh(jh);
+
+ if (!trylock_buffer(bh))
+ return -EBUSY;
+ if (buffer_dirty(bh)) {
+ unlock_buffer(bh);
+ return -EBUSY;
+ }
+ unlock_buffer(bh);
+
+ /*
+ * Buffer is clean and the IO has finished (we held the buffer
+ * lock) so the checkpoint is done. We can safely remove the
+ * buffer from this transaction.
+ */
+ JBUFFER_TRACE(jh, "remove from checkpoint list");
+ return __jbd2_journal_remove_checkpoint(jh);
+}
+
+/*
* journal_insert_checkpoint: put a committed buffer onto a checkpoint
* list so that we know when it is safe to clean the transaction out of
* the log.
@@ -797,7 +709,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
J_ASSERT(transaction->t_forget == NULL);
J_ASSERT(transaction->t_shadow_list == NULL);
J_ASSERT(transaction->t_checkpoint_list == NULL);
- J_ASSERT(transaction->t_checkpoint_io_list == NULL);
J_ASSERT(atomic_read(&transaction->t_updates) == 0);
J_ASSERT(journal->j_committing_transaction != transaction);
J_ASSERT(journal->j_running_transaction != transaction);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index b33155dd7001..1073259902a6 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -1141,8 +1141,7 @@ restart_loop:
spin_lock(&journal->j_list_lock);
commit_transaction->t_state = T_FINISHED;
/* Check if the transaction can be dropped now that we are finished */
- if (commit_transaction->t_checkpoint_list == NULL &&
- commit_transaction->t_checkpoint_io_list == NULL) {
+ if (commit_transaction->t_checkpoint_list == NULL) {
__jbd2_journal_drop_transaction(journal, commit_transaction);
jbd2_journal_free_transaction(commit_transaction);
}
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 18611241f451..4d1fda1f7143 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1784,8 +1784,7 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh)
* Otherwise, if the buffer has been written to disk,
* it is safe to remove the checkpoint and drop it.
*/
- if (!buffer_dirty(bh)) {
- __jbd2_journal_remove_checkpoint(jh);
+ if (jbd2_journal_try_remove_checkpoint(jh) >= 0) {
spin_unlock(&journal->j_list_lock);
goto drop;
}
@@ -2100,35 +2099,6 @@ void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
__brelse(bh);
}
-/*
- * Called from jbd2_journal_try_to_free_buffers().
- *
- * Called under jh->b_state_lock
- */
-static void
-__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
-{
- struct journal_head *jh;
-
- jh = bh2jh(bh);
-
- if (buffer_locked(bh) || buffer_dirty(bh))
- goto out;
-
- if (jh->b_next_transaction != NULL || jh->b_transaction != NULL)
- goto out;
-
- spin_lock(&journal->j_list_lock);
- if (jh->b_cp_transaction != NULL) {
- /* written-back checkpointed metadata buffer */
- JBUFFER_TRACE(jh, "remove from checkpoint list");
- __jbd2_journal_remove_checkpoint(jh);
- }
- spin_unlock(&journal->j_list_lock);
-out:
- return;
-}
-
/**
* jbd2_journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
@@ -2186,7 +2156,13 @@ bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio)
continue;
spin_lock(&jh->b_state_lock);
- __journal_try_to_free_buffer(journal, bh);
+ if (!jh->b_transaction && !jh->b_next_transaction) {
+ spin_lock(&journal->j_list_lock);
+ /* Remove written-back checkpointed metadata buffer */
+ if (jh->b_cp_transaction != NULL)
+ jbd2_journal_try_remove_checkpoint(jh);
+ spin_unlock(&journal->j_list_lock);
+ }
spin_unlock(&jh->b_state_lock);
jbd2_journal_put_journal_head(jh);
if (buffer_jbd(bh))
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 541578126b1a..029d47065600 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1535,9 +1535,10 @@ const struct inode_operations jfs_dir_inode_operations = {
#endif
};
+WRAP_DIR_ITER(jfs_readdir) // FIXME!
const struct file_operations jfs_dir_operations = {
.read = generic_read_dir,
- .iterate = jfs_readdir,
+ .iterate_shared = shared_jfs_readdir,
.fsync = jfs_fsync,
.unlocked_ioctl = jfs_ioctl,
.compat_ioctl = compat_ptr_ioctl,
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 9a18c5a69ace..aaffaaa336cc 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -472,20 +472,26 @@ out:
return result;
}
-static void
-nfs_direct_join_group(struct list_head *list, struct inode *inode)
+static void nfs_direct_join_group(struct list_head *list, struct inode *inode)
{
- struct nfs_page *req, *next;
+ struct nfs_page *req, *subreq;
list_for_each_entry(req, list, wb_list) {
- if (req->wb_head != req || req->wb_this_page == req)
+ if (req->wb_head != req)
continue;
- for (next = req->wb_this_page;
- next != req->wb_head;
- next = next->wb_this_page) {
- nfs_list_remove_request(next);
- nfs_release_request(next);
- }
+ subreq = req->wb_this_page;
+ if (subreq == req)
+ continue;
+ do {
+ /*
+ * Remove subrequests from this list before freeing
+ * them in the call to nfs_join_page_group().
+ */
+ if (!list_empty(&subreq->wb_list)) {
+ nfs_list_remove_request(subreq);
+ nfs_release_request(subreq);
+ }
+ } while ((subreq = subreq->wb_this_page) != req);
nfs_join_page_group(req, inode);
}
}
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 63802d195556..49f78e23b34c 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -1377,7 +1377,6 @@ ssize_t nfs42_proc_getxattr(struct inode *inode, const char *name,
for (i = 0; i < np; i++) {
pages[i] = alloc_page(GFP_KERNEL);
if (!pages[i]) {
- np = i + 1;
err = -ENOMEM;
goto out;
}
@@ -1401,8 +1400,8 @@ ssize_t nfs42_proc_getxattr(struct inode *inode, const char *name,
} while (exception.retry);
out:
- while (--np >= 0)
- __free_page(pages[np]);
+ while (--i >= 0)
+ __free_page(pages[i]);
kfree(pages);
return err;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e1a886b58354..832fa226b8f2 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -6004,9 +6004,8 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf,
out_ok:
ret = res.acl_len;
out_free:
- for (i = 0; i < npages; i++)
- if (pages[i])
- __free_page(pages[i]);
+ while (--i >= 0)
+ __free_page(pages[i]);
if (res.acl_scratch)
__free_page(res.acl_scratch);
kfree(pages);
@@ -7181,8 +7180,15 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
} else if (!nfs4_update_lock_stateid(lsp, &data->res.stateid))
goto out_restart;
break;
- case -NFS4ERR_BAD_STATEID:
case -NFS4ERR_OLD_STATEID:
+ if (data->arg.new_lock_owner != 0 &&
+ nfs4_refresh_open_old_stateid(&data->arg.open_stateid,
+ lsp->ls_state))
+ goto out_restart;
+ if (nfs4_refresh_lock_old_stateid(&data->arg.lock_stateid, lsp))
+ goto out_restart;
+ fallthrough;
+ case -NFS4ERR_BAD_STATEID:
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_EXPIRED:
if (data->arg.new_lock_owner != 0) {
diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c
index acda8f033d30..bf378ecd5d9f 100644
--- a/fs/nfs/sysfs.c
+++ b/fs/nfs/sysfs.c
@@ -345,8 +345,10 @@ void nfs_sysfs_move_sb_to_server(struct nfs_server *server)
int ret = -ENOMEM;
s = kasprintf(GFP_KERNEL, "server-%d", server->s_sysfs_id);
- if (s)
+ if (s) {
ret = kobject_rename(&server->kobj, s);
+ kfree(s);
+ }
if (ret < 0)
pr_warn("NFS: rename sysfs %s failed (%d)\n",
server->kobj.name, ret);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 6e61fa3acaf1..daf305daa751 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1354,9 +1354,9 @@ static void revoke_delegation(struct nfs4_delegation *dp)
trace_nfsd_stid_revoke(&dp->dl_stid);
if (clp->cl_minorversion) {
+ spin_lock(&clp->cl_lock);
dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID;
refcount_inc(&dp->dl_stid.sc_count);
- spin_lock(&clp->cl_lock);
list_add(&dp->dl_recall_lru, &clp->cl_revoked);
spin_unlock(&clp->cl_lock);
}
@@ -6341,8 +6341,6 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) ||
CLOSE_STATEID(stateid))
return status;
- if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid))
- return status;
spin_lock(&cl->cl_lock);
s = find_stateid_locked(cl, stateid);
if (!s)
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index a53c5660a8c4..3709830f90a6 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1105,6 +1105,7 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
if (!nn->nfsd_serv)
return -EBUSY;
trace_nfsd_end_grace(netns(file));
+ nfsd4_end_grace(nn);
break;
default:
return -EINVAL;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 40a68bae88fc..9b7acba382fe 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -956,10 +956,13 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
last_page = page + (offset + sd->len - 1) / PAGE_SIZE;
for (page += offset / PAGE_SIZE; page <= last_page; page++) {
/*
- * Skip page replacement when extending the contents
- * of the current page.
+ * Skip page replacement when extending the contents of the
+ * current page. But note that we may get two zero_pages in a
+ * row from shmem.
*/
- if (page == *(rqstp->rq_next_page - 1))
+ if (page == *(rqstp->rq_next_page - 1) &&
+ offset_in_page(rqstp->rq_res.page_base +
+ rqstp->rq_res.page_len))
continue;
if (unlikely(!svc_rqst_replace_page(rqstp, page)))
return -EIO;
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 5259b94ca1dc..d588c719d743 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -1101,9 +1101,17 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty)
int __nilfs_mark_inode_dirty(struct inode *inode, int flags)
{
+ struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
struct buffer_head *ibh;
int err;
+ /*
+ * Do not dirty inodes after the log writer has been detached
+ * and its nilfs_root struct has been freed.
+ */
+ if (unlikely(nilfs_purging(nilfs)))
+ return 0;
+
err = nilfs_load_inode_block(inode, &ibh);
if (unlikely(err)) {
nilfs_warn(inode->i_sb,
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index c2553024bd25..7ec16879756e 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -725,6 +725,11 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode,
struct folio *folio = fbatch.folios[i];
folio_lock(folio);
+ if (unlikely(folio->mapping != mapping)) {
+ /* Exclude folios removed from the address space */
+ folio_unlock(folio);
+ continue;
+ }
head = folio_buffers(folio);
if (!head) {
create_empty_buffers(&folio->page, i_blocksize(inode), 0);
@@ -2845,6 +2850,7 @@ void nilfs_detach_log_writer(struct super_block *sb)
nilfs_segctor_destroy(nilfs->ns_writer);
nilfs->ns_writer = NULL;
}
+ set_nilfs_purging(nilfs);
/* Force to free the list of dirty files */
spin_lock(&nilfs->ns_inode_lock);
@@ -2857,4 +2863,5 @@ void nilfs_detach_log_writer(struct super_block *sb)
up_write(&nilfs->ns_segctor_sem);
nilfs_dispose_list(nilfs, &garbage_list, 1);
+ clear_nilfs_purging(nilfs);
}
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 47c7dfbb7ea5..cd4ae1b8ae16 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -29,6 +29,7 @@ enum {
THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */
THE_NILFS_GC_RUNNING, /* gc process is running */
THE_NILFS_SB_DIRTY, /* super block is dirty */
+ THE_NILFS_PURGING, /* disposing dirty files for cleanup */
};
/**
@@ -208,6 +209,7 @@ THE_NILFS_FNS(INIT, init)
THE_NILFS_FNS(DISCONTINUED, discontinued)
THE_NILFS_FNS(GC_RUNNING, gc_running)
THE_NILFS_FNS(SB_DIRTY, sb_dirty)
+THE_NILFS_FNS(PURGING, purging)
/*
* Mount option operations
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index 52ccd34b1e79..a026dbd3593f 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -272,7 +272,7 @@ int unregister_nls(struct nls_table * nls)
return -EINVAL;
}
-static struct nls_table *find_nls(char *charset)
+static struct nls_table *find_nls(const char *charset)
{
struct nls_table *nls;
spin_lock(&nls_lock);
@@ -288,7 +288,7 @@ static struct nls_table *find_nls(char *charset)
return nls;
}
-struct nls_table *load_nls(char *charset)
+struct nls_table *load_nls(const char *charset)
{
return try_then_request_module(find_nls(charset), "nls_%s", charset);
}
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 518c3a21a556..4596c90e7b7c 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1525,10 +1525,11 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
#endif /* NTFS_RW */
+WRAP_DIR_ITER(ntfs_readdir) // FIXME!
const struct file_operations ntfs_dir_ops = {
.llseek = generic_file_llseek, /* Seek inside directory. */
.read = generic_read_dir, /* Return -EISDIR. */
- .iterate = ntfs_readdir, /* Read directory contents. */
+ .iterate_shared = shared_ntfs_readdir, /* Read directory contents. */
#ifdef NTFS_RW
.fsync = ntfs_dir_fsync, /* Sync a directory to disk. */
#endif /* NTFS_RW */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 8184499ae7a5..3b91b4cc7c6a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2795,10 +2795,11 @@ const struct file_operations ocfs2_fops = {
.remap_file_range = ocfs2_remap_file_range,
};
+WRAP_DIR_ITER(ocfs2_readdir) // FIXME!
const struct file_operations ocfs2_dops = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .iterate = ocfs2_readdir,
+ .iterate_shared = shared_ocfs2_readdir,
.fsync = ocfs2_sync_file,
.release = ocfs2_dir_release,
.open = ocfs2_dir_open,
@@ -2844,7 +2845,7 @@ const struct file_operations ocfs2_fops_no_plocks = {
const struct file_operations ocfs2_dops_no_plocks = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .iterate = ocfs2_readdir,
+ .iterate_shared = shared_ocfs2_readdir,
.fsync = ocfs2_sync_file,
.release = ocfs2_dir_release,
.open = ocfs2_dir_open,
diff --git a/fs/open.c b/fs/open.c
index 0c55c8e7f837..e6ead0f19964 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1322,7 +1322,7 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
lookup_flags |= LOOKUP_IN_ROOT;
if (how->resolve & RESOLVE_CACHED) {
/* Don't bother even trying for create/truncate/tmpfile open */
- if (flags & (O_TRUNC | O_CREAT | O_TMPFILE))
+ if (flags & (O_TRUNC | O_CREAT | __O_TMPFILE))
return -EAGAIN;
lookup_flags |= LOOKUP_CACHED;
}
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index ee5c4736480f..de39e067ae65 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -954,10 +954,11 @@ static int ovl_dir_open(struct inode *inode, struct file *file)
return 0;
}
+WRAP_DIR_ITER(ovl_iterate) // FIXME!
const struct file_operations ovl_dir_operations = {
.read = generic_read_dir,
.open = ovl_dir_open,
- .iterate = ovl_iterate,
+ .iterate_shared = shared_ovl_iterate,
.llseek = ovl_dir_llseek,
.fsync = ovl_dir_fsync,
.release = ovl_dir_release,
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 5b069f1a1e44..cc8977498c48 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1460,7 +1460,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
ovl_trusted_xattr_handlers;
sb->s_fs_info = ofs;
sb->s_flags |= SB_POSIXACL;
- sb->s_iflags |= SB_I_SKIP_SYNC;
+ sb->s_iflags |= SB_I_SKIP_SYNC | SB_I_IMA_UNVERIFIABLE_SIGNATURE;
err = -ENOMEM;
root_dentry = ovl_get_root(sb, ctx->upper.dentry, oe);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 2fcb393836ab..2c009d5d9282 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2817,7 +2817,7 @@ static int proc_##LSM##_attr_dir_iterate(struct file *filp, \
\
static const struct file_operations proc_##LSM##_attr_dir_ops = { \
.read = generic_read_dir, \
- .iterate = proc_##LSM##_attr_dir_iterate, \
+ .iterate_shared = proc_##LSM##_attr_dir_iterate, \
.llseek = default_llseek, \
}; \
\
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 9cb32e1a78a0..23fc24d16b31 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -309,6 +309,8 @@ static void append_kcore_note(char *notes, size_t *i, const char *name,
static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
{
+ struct file *file = iocb->ki_filp;
+ char *buf = file->private_data;
loff_t *fpos = &iocb->ki_pos;
size_t phdrs_offset, notes_offset, data_offset;
size_t page_offline_frozen = 1;
@@ -555,10 +557,21 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
case KCORE_VMEMMAP:
case KCORE_TEXT:
/*
- * We use _copy_to_iter() to bypass usermode hardening
- * which would otherwise prevent this operation.
+ * Sadly we must use a bounce buffer here to be able to
+ * make use of copy_from_kernel_nofault(), as these
+ * memory regions might not always be mapped on all
+ * architectures.
*/
- if (_copy_to_iter((char *)start, tsz, iter) != tsz) {
+ if (copy_from_kernel_nofault(buf, (void *)start, tsz)) {
+ if (iov_iter_zero(tsz, iter) != tsz) {
+ ret = -EFAULT;
+ goto out;
+ }
+ /*
+ * We know the bounce buffer is safe to copy from, so
+ * use _copy_to_iter() directly.
+ */
+ } else if (_copy_to_iter(buf, tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
@@ -595,6 +608,10 @@ static int open_kcore(struct inode *inode, struct file *filp)
if (ret)
return ret;
+ filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!filp->private_data)
+ return -ENOMEM;
+
if (kcore_need_update)
kcore_update_ram();
if (i_size_read(inode) != proc_root_kcore->size) {
@@ -605,9 +622,16 @@ static int open_kcore(struct inode *inode, struct file *filp)
return 0;
}
+static int release_kcore(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
static const struct proc_ops kcore_proc_ops = {
.proc_read_iter = read_kcore_iter,
.proc_open = open_kcore,
+ .proc_release = release_kcore,
.proc_lseek = default_llseek,
};
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 507cd4e59d07..fafff1bd34cd 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -587,8 +587,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
bool migration = false;
if (pmd_present(*pmd)) {
- /* FOLL_DUMP will return -EFAULT on huge zero page */
- page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
+ page = vm_normal_page_pmd(vma, addr, *pmd);
} else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
swp_entry_t entry = pmd_to_swp_entry(*pmd);
@@ -758,12 +757,14 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
static const struct mm_walk_ops smaps_walk_ops = {
.pmd_entry = smaps_pte_range,
.hugetlb_entry = smaps_hugetlb_range,
+ .walk_lock = PGWALK_RDLOCK,
};
static const struct mm_walk_ops smaps_shmem_walk_ops = {
.pmd_entry = smaps_pte_range,
.hugetlb_entry = smaps_hugetlb_range,
.pte_hole = smaps_pte_hole,
+ .walk_lock = PGWALK_RDLOCK,
};
/*
@@ -1245,6 +1246,7 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end,
static const struct mm_walk_ops clear_refs_walk_ops = {
.pmd_entry = clear_refs_pte_range,
.test_walk = clear_refs_test_walk,
+ .walk_lock = PGWALK_WRLOCK,
};
static ssize_t clear_refs_write(struct file *file, const char __user *buf,
@@ -1622,6 +1624,7 @@ static const struct mm_walk_ops pagemap_ops = {
.pmd_entry = pagemap_pmd_range,
.pte_hole = pagemap_pte_hole,
.hugetlb_entry = pagemap_hugetlb_range,
+ .walk_lock = PGWALK_RDLOCK,
};
/*
@@ -1935,6 +1938,7 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
static const struct mm_walk_ops show_numa_ops = {
.hugetlb_entry = gather_hugetlb_stats,
.pmd_entry = gather_pte_stats,
+ .walk_lock = PGWALK_RDLOCK,
};
/*
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index cb80a7703d58..1fb213f379a5 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -132,7 +132,7 @@ ssize_t read_from_oldmem(struct iov_iter *iter, size_t count,
u64 *ppos, bool encrypted)
{
unsigned long pfn, offset;
- size_t nr_bytes;
+ ssize_t nr_bytes;
ssize_t read = 0, tmp;
int idx;
diff --git a/fs/readdir.c b/fs/readdir.c
index b264ce60114d..c8c46e294431 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -25,6 +25,53 @@
#include <asm/unaligned.h>
/*
+ * Some filesystems were never converted to '->iterate_shared()'
+ * and their directory iterators want the inode lock held for
+ * writing. This wrapper allows for converting from the shared
+ * semantics to the exclusive inode use.
+ */
+int wrap_directory_iterator(struct file *file,
+ struct dir_context *ctx,
+ int (*iter)(struct file *, struct dir_context *))
+{
+ struct inode *inode = file_inode(file);
+ int ret;
+
+ /*
+ * We'd love to have an 'inode_upgrade_trylock()' operation,
+ * see the comment in mmap_upgrade_trylock() in mm/memory.c.
+ *
+ * But considering this is for "filesystems that never got
+ * converted", it really doesn't matter.
+ *
+ * Also note that since we have to return with the lock held
+ * for reading, we can't use the "killable()" locking here,
+ * since we do need to get the lock even if we're dying.
+ *
+ * We could do the write part killably and then get the read
+ * lock unconditionally if it mattered, but see above on why
+ * this does the very simplistic conversion.
+ */
+ up_read(&inode->i_rwsem);
+ down_write(&inode->i_rwsem);
+
+ /*
+ * Since we dropped the inode lock, we should do the
+ * DEADDIR test again. See 'iterate_dir()' below.
+ *
+ * Note that we don't need to re-do the f_pos games,
+ * since the file must be locked wrt f_pos anyway.
+ */
+ ret = -ENOENT;
+ if (!IS_DEADDIR(inode))
+ ret = iter(file, ctx);
+
+ downgrade_write(&inode->i_rwsem);
+ return ret;
+}
+EXPORT_SYMBOL(wrap_directory_iterator);
+
+/*
* Note the "unsafe_put_user() semantics: we goto a
* label for errors.
*/
@@ -40,39 +87,28 @@
int iterate_dir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
- bool shared = false;
int res = -ENOTDIR;
- if (file->f_op->iterate_shared)
- shared = true;
- else if (!file->f_op->iterate)
+
+ if (!file->f_op->iterate_shared)
goto out;
res = security_file_permission(file, MAY_READ);
if (res)
goto out;
- if (shared)
- res = down_read_killable(&inode->i_rwsem);
- else
- res = down_write_killable(&inode->i_rwsem);
+ res = down_read_killable(&inode->i_rwsem);
if (res)
goto out;
res = -ENOENT;
if (!IS_DEADDIR(inode)) {
ctx->pos = file->f_pos;
- if (shared)
- res = file->f_op->iterate_shared(file, ctx);
- else
- res = file->f_op->iterate(file, ctx);
+ res = file->f_op->iterate_shared(file, ctx);
file->f_pos = ctx->pos;
fsnotify_access(file);
file_accessed(file);
}
- if (shared)
- inode_unlock_shared(inode);
- else
- inode_unlock(inode);
+ inode_unlock_shared(inode);
out:
return res;
}
diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c
index fb4162a52844..aec6e9137474 100644
--- a/fs/smb/client/cifs_debug.c
+++ b/fs/smb/client/cifs_debug.c
@@ -153,6 +153,11 @@ cifs_dump_channel(struct seq_file *m, int i, struct cifs_chan *chan)
in_flight(server),
atomic_read(&server->in_send),
atomic_read(&server->num_waiters));
+#ifdef CONFIG_NET_NS
+ if (server->net)
+ seq_printf(m, " Net namespace: %u ", server->net->ns.inum);
+#endif /* NET_NS */
+
}
static inline const char *smb_speed_to_str(size_t bps)
@@ -430,10 +435,15 @@ skip_rdma:
server->reconnect_instance,
server->srv_count,
server->sec_mode, in_flight(server));
+#ifdef CONFIG_NET_NS
+ if (server->net)
+ seq_printf(m, " Net namespace: %u ", server->net->ns.inum);
+#endif /* NET_NS */
seq_printf(m, "\nIn Send: %d In MaxReq Wait: %d",
atomic_read(&server->in_send),
atomic_read(&server->num_waiters));
+
if (server->leaf_fullpath) {
seq_printf(m, "\nDFS leaf full path: %s",
server->leaf_fullpath);
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index d7274eefc666..15c8cc4b6680 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -159,6 +159,6 @@ extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
/* when changing internal version - update following two lines at same time */
-#define SMB3_PRODUCT_BUILD 43
-#define CIFS_VERSION "2.43"
+#define SMB3_PRODUCT_BUILD 44
+#define CIFS_VERSION "2.44"
#endif /* _CIFSFS_H */
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index b5808fe3469a..657dee4b2c8c 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -532,7 +532,7 @@ struct smb_version_operations {
/* Check for STATUS_IO_TIMEOUT */
bool (*is_status_io_timeout)(char *buf);
/* Check for STATUS_NETWORK_NAME_DELETED */
- void (*is_network_name_deleted)(char *buf, struct TCP_Server_Info *srv);
+ bool (*is_network_name_deleted)(char *buf, struct TCP_Server_Info *srv);
};
struct smb_version_values {
@@ -1062,6 +1062,7 @@ struct cifs_ses {
unsigned long chans_need_reconnect;
/* ========= end: protected by chan_lock ======== */
struct cifs_ses *dfs_root_ses;
+ struct nls_table *local_nls;
};
static inline bool
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 19f7385abeec..25503f1a4fd2 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -129,7 +129,7 @@ again:
}
spin_unlock(&server->srv_lock);
- nls_codepage = load_nls_default();
+ nls_codepage = ses->local_nls;
/*
* need to prevent multiple threads trying to simultaneously
@@ -200,7 +200,6 @@ out:
rc = -EAGAIN;
}
- unload_nls(nls_codepage);
return rc;
}
@@ -3184,7 +3183,7 @@ setAclRetry:
param_offset = offsetof(struct smb_com_transaction2_spi_req,
InformationLevel) - 4;
offset = param_offset + params;
- parm_data = ((char *) &pSMB->hdr.Protocol) + offset;
+ parm_data = ((char *)pSMB) + sizeof(pSMB->hdr.smb_buf_length) + offset;
pSMB->ParameterOffset = cpu_to_le16(param_offset);
/* convert to on the wire format for POSIX ACL */
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index 85dd1b373974..238538dde4e3 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -60,7 +60,7 @@ extern bool disable_legacy_dialects;
#define TLINK_IDLE_EXPIRE (600 * HZ)
/* Drop the connection to not overload the server */
-#define NUM_STATUS_IO_TIMEOUT 5
+#define MAX_STATUS_IO_TIMEOUT 5
static int ip_connect(struct TCP_Server_Info *server);
static int generic_ip_connect(struct TCP_Server_Info *server);
@@ -1117,6 +1117,7 @@ cifs_demultiplex_thread(void *p)
struct mid_q_entry *mids[MAX_COMPOUND];
char *bufs[MAX_COMPOUND];
unsigned int noreclaim_flag, num_io_timeout = 0;
+ bool pending_reconnect = false;
noreclaim_flag = memalloc_noreclaim_save();
cifs_dbg(FYI, "Demultiplex PID: %d\n", task_pid_nr(current));
@@ -1156,6 +1157,8 @@ cifs_demultiplex_thread(void *p)
cifs_dbg(FYI, "RFC1002 header 0x%x\n", pdu_length);
if (!is_smb_response(server, buf[0]))
continue;
+
+ pending_reconnect = false;
next_pdu:
server->pdu_size = pdu_length;
@@ -1213,10 +1216,13 @@ next_pdu:
if (server->ops->is_status_io_timeout &&
server->ops->is_status_io_timeout(buf)) {
num_io_timeout++;
- if (num_io_timeout > NUM_STATUS_IO_TIMEOUT) {
- cifs_reconnect(server, false);
+ if (num_io_timeout > MAX_STATUS_IO_TIMEOUT) {
+ cifs_server_dbg(VFS,
+ "Number of request timeouts exceeded %d. Reconnecting",
+ MAX_STATUS_IO_TIMEOUT);
+
+ pending_reconnect = true;
num_io_timeout = 0;
- continue;
}
}
@@ -1226,9 +1232,14 @@ next_pdu:
if (mids[i] != NULL) {
mids[i]->resp_buf_size = server->pdu_size;
- if (bufs[i] && server->ops->is_network_name_deleted)
- server->ops->is_network_name_deleted(bufs[i],
- server);
+ if (bufs[i] != NULL) {
+ if (server->ops->is_network_name_deleted &&
+ server->ops->is_network_name_deleted(bufs[i],
+ server)) {
+ cifs_server_dbg(FYI,
+ "Share deleted. Reconnect needed");
+ }
+ }
if (!mids[i]->multiRsp || mids[i]->multiEnd)
mids[i]->callback(mids[i]);
@@ -1263,6 +1274,11 @@ next_pdu:
buf = server->smallbuf;
goto next_pdu;
}
+
+ /* do this reconnect at the very end after processing all MIDs */
+ if (pending_reconnect)
+ cifs_reconnect(server, true);
+
} /* end while !EXITING */
/* buffer usually freed in free_mid - need to free it here on exit */
@@ -1826,6 +1842,10 @@ static int match_session(struct cifs_ses *ses, struct smb3_fs_context *ctx)
CIFS_MAX_PASSWORD_LEN))
return 0;
}
+
+ if (strcmp(ctx->local_nls->charset, ses->local_nls->charset))
+ return 0;
+
return 1;
}
@@ -2270,6 +2290,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
ses->sectype = ctx->sectype;
ses->sign = ctx->sign;
+ ses->local_nls = load_nls(ctx->local_nls->charset);
/* add server as first channel */
spin_lock(&ses->chan_lock);
diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c
index 1403a2d1ab17..ee772c3d9f00 100644
--- a/fs/smb/client/dfs.c
+++ b/fs/smb/client/dfs.c
@@ -66,6 +66,12 @@ static int get_session(struct cifs_mount_ctx *mnt_ctx, const char *full_path)
return rc;
}
+/*
+ * Track individual DFS referral servers used by new DFS mount.
+ *
+ * On success, their lifetime will be shared by final tcon (dfs_ses_list).
+ * Otherwise, they will be put by dfs_put_root_smb_sessions() in cifs_mount().
+ */
static int add_root_smb_session(struct cifs_mount_ctx *mnt_ctx)
{
struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
@@ -80,11 +86,12 @@ static int add_root_smb_session(struct cifs_mount_ctx *mnt_ctx)
INIT_LIST_HEAD(&root_ses->list);
spin_lock(&cifs_tcp_ses_lock);
- ses->ses_count++;
+ cifs_smb_ses_inc_refcount(ses);
spin_unlock(&cifs_tcp_ses_lock);
root_ses->ses = ses;
list_add_tail(&root_ses->list, &mnt_ctx->dfs_ses_list);
}
+ /* Select new DFS referral server so that new referrals go through it */
ctx->dfs_root_ses = ses;
return 0;
}
@@ -170,8 +177,12 @@ static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx)
struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl);
rc = dfs_get_referral(mnt_ctx, ref_path + 1, NULL, &tl);
- if (rc)
+ if (rc) {
+ rc = cifs_mount_get_tcon(mnt_ctx);
+ if (!rc)
+ rc = cifs_is_path_remote(mnt_ctx);
break;
+ }
tit = dfs_cache_get_tgt_iterator(&tl);
if (!tit) {
@@ -242,7 +253,6 @@ out:
int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs)
{
struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
- struct cifs_ses *ses;
bool nodfs = ctx->nodfs;
int rc;
@@ -276,20 +286,8 @@ int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs)
}
*isdfs = true;
- /*
- * Prevent DFS root session of being put in the first call to
- * cifs_mount_put_conns(). If another DFS root server was not found
- * while chasing the referrals (@ctx->dfs_root_ses == @ses), then we
- * can safely put extra refcount of @ses.
- */
- ses = mnt_ctx->ses;
- mnt_ctx->ses = NULL;
- mnt_ctx->server = NULL;
- rc = __dfs_mount_share(mnt_ctx);
- if (ses == ctx->dfs_root_ses)
- cifs_put_smb_ses(ses);
-
- return rc;
+ add_root_smb_session(mnt_ctx);
+ return __dfs_mount_share(mnt_ctx);
}
/* Update dfs referral path of superblock */
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 689058e1b6e6..2108b3b40ce9 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -1080,8 +1080,8 @@ int cifs_close(struct inode *inode, struct file *file)
cfile = file->private_data;
file->private_data = NULL;
dclose = kmalloc(sizeof(struct cifs_deferred_close), GFP_KERNEL);
- if ((cinode->oplock == CIFS_CACHE_RHW_FLG) &&
- cinode->lease_granted &&
+ if ((cifs_sb->ctx->closetimeo && cinode->oplock == CIFS_CACHE_RHW_FLG)
+ && cinode->lease_granted &&
!test_bit(CIFS_INO_CLOSE_ON_LOCK, &cinode->flags) &&
dclose) {
if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) {
@@ -4681,9 +4681,9 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
io_error:
kunmap(page);
- unlock_page(page);
read_complete:
+ unlock_page(page);
return rc;
}
@@ -4878,9 +4878,11 @@ void cifs_oplock_break(struct work_struct *work)
struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
oplock_break);
struct inode *inode = d_inode(cfile->dentry);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifsInodeInfo *cinode = CIFS_I(inode);
- struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
- struct TCP_Server_Info *server = tcon->ses->server;
+ struct cifs_tcon *tcon;
+ struct TCP_Server_Info *server;
+ struct tcon_link *tlink;
int rc = 0;
bool purge_cache = false, oplock_break_cancelled;
__u64 persistent_fid, volatile_fid;
@@ -4889,6 +4891,12 @@ void cifs_oplock_break(struct work_struct *work)
wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS,
TASK_UNINTERRUPTIBLE);
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink))
+ goto out;
+ tcon = tlink_tcon(tlink);
+ server = tcon->ses->server;
+
server->ops->downgrade_oplock(server, cinode, cfile->oplock_level,
cfile->oplock_epoch, &purge_cache);
@@ -4938,18 +4946,19 @@ oplock_break_ack:
/*
* MS-SMB2 3.2.5.19.1 and 3.2.5.19.2 (and MS-CIFS 3.2.5.42) do not require
* an acknowledgment to be sent when the file has already been closed.
- * check for server null, since can race with kill_sb calling tree disconnect.
*/
spin_lock(&cinode->open_file_lock);
- if (tcon->ses && tcon->ses->server && !oplock_break_cancelled &&
- !list_empty(&cinode->openFileList)) {
+ /* check list empty since can race with kill_sb calling tree disconnect */
+ if (!oplock_break_cancelled && !list_empty(&cinode->openFileList)) {
spin_unlock(&cinode->open_file_lock);
- rc = tcon->ses->server->ops->oplock_response(tcon, persistent_fid,
- volatile_fid, net_fid, cinode);
+ rc = server->ops->oplock_response(tcon, persistent_fid,
+ volatile_fid, net_fid, cinode);
cifs_dbg(FYI, "Oplock release rc = %d\n", rc);
} else
spin_unlock(&cinode->open_file_lock);
+ cifs_put_tlink(tlink);
+out:
cifs_done_oplock_break(cinode);
}
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index 4946a0c59600..67e16c2ac90e 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -231,6 +231,8 @@ cifs_parse_security_flavors(struct fs_context *fc, char *value, struct smb3_fs_c
break;
case Opt_sec_none:
ctx->nullauth = 1;
+ kfree(ctx->username);
+ ctx->username = NULL;
break;
default:
cifs_errorf(fc, "bad security option: %s\n", value);
@@ -1201,6 +1203,8 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
case Opt_user:
kfree(ctx->username);
ctx->username = NULL;
+ if (ctx->nullauth)
+ break;
if (strlen(param->string) == 0) {
/* null user, ie. anonymous authentication */
ctx->nullauth = 1;
diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c
index fff092bbc7a3..f7160003e0ed 100644
--- a/fs/smb/client/ioctl.c
+++ b/fs/smb/client/ioctl.c
@@ -433,16 +433,21 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
* Dump encryption keys. This is an old ioctl that only
* handles AES-128-{CCM,GCM}.
*/
- if (pSMBFile == NULL)
- break;
if (!capable(CAP_SYS_ADMIN)) {
rc = -EACCES;
break;
}
- tcon = tlink_tcon(pSMBFile->tlink);
+ cifs_sb = CIFS_SB(inode->i_sb);
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink)) {
+ rc = PTR_ERR(tlink);
+ break;
+ }
+ tcon = tlink_tcon(tlink);
if (!smb3_encryption_required(tcon)) {
rc = -EOPNOTSUPP;
+ cifs_put_tlink(tlink);
break;
}
pkey_inf.cipher_type =
@@ -459,6 +464,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
rc = -EFAULT;
else
rc = 0;
+ cifs_put_tlink(tlink);
break;
case CIFS_DUMP_FULL_KEY:
/*
@@ -470,8 +476,16 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
rc = -EACCES;
break;
}
- tcon = tlink_tcon(pSMBFile->tlink);
+ cifs_sb = CIFS_SB(inode->i_sb);
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink)) {
+ rc = PTR_ERR(tlink);
+ break;
+ }
+
+ tcon = tlink_tcon(tlink);
rc = cifs_dump_full_key(tcon, (void __user *)arg);
+ cifs_put_tlink(tlink);
break;
case CIFS_IOC_NOTIFY:
if (!S_ISDIR(inode->i_mode)) {
diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c
index 70dbfe6584f9..d7e85d9a2655 100644
--- a/fs/smb/client/misc.c
+++ b/fs/smb/client/misc.c
@@ -95,6 +95,7 @@ sesInfoFree(struct cifs_ses *buf_to_free)
return;
}
+ unload_nls(buf_to_free->local_nls);
atomic_dec(&sesInfoAllocCount);
kfree(buf_to_free->serverOS);
kfree(buf_to_free->serverDomain);
diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c
index 335c078c42fb..c57ca2050b73 100644
--- a/fs/smb/client/sess.c
+++ b/fs/smb/client/sess.c
@@ -1013,6 +1013,7 @@ setup_ntlm_smb3_neg_ret:
}
+/* See MS-NLMP 2.2.1.3 */
int build_ntlmssp_auth_blob(unsigned char **pbuffer,
u16 *buflen,
struct cifs_ses *ses,
@@ -1047,7 +1048,8 @@ int build_ntlmssp_auth_blob(unsigned char **pbuffer,
flags = ses->ntlmssp->server_flags | NTLMSSP_REQUEST_TARGET |
NTLMSSP_NEGOTIATE_TARGET_INFO | NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED;
-
+ /* we only send version information in ntlmssp negotiate, so do not set this flag */
+ flags = flags & ~NTLMSSP_NEGOTIATE_VERSION;
tmp = *pbuffer + sizeof(AUTHENTICATE_MESSAGE);
sec_blob->NegotiateFlags = cpu_to_le32(flags);
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 3cc3c4a71e32..182e2e879ecf 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -2396,7 +2396,7 @@ smb2_is_status_io_timeout(char *buf)
return false;
}
-static void
+static bool
smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
{
struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
@@ -2405,7 +2405,7 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
struct cifs_tcon *tcon;
if (shdr->Status != STATUS_NETWORK_NAME_DELETED)
- return;
+ return false;
/* If server is a channel, select the primary channel */
pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
@@ -2420,11 +2420,13 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
spin_unlock(&cifs_tcp_ses_lock);
pr_warn_once("Server share %s deleted.\n",
tcon->tree_name);
- return;
+ return true;
}
}
}
spin_unlock(&cifs_tcp_ses_lock);
+
+ return false;
}
static int
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index e04766fe6f80..a457f07f820d 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -242,7 +242,7 @@ again:
}
spin_unlock(&server->srv_lock);
- nls_codepage = load_nls_default();
+ nls_codepage = ses->local_nls;
/*
* need to prevent multiple threads trying to simultaneously
@@ -324,7 +324,6 @@ out:
rc = -EAGAIN;
}
failed:
- unload_nls(nls_codepage);
return rc;
}
diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c
index c6db898dab7c..7676091b3e77 100644
--- a/fs/smb/client/smb2transport.c
+++ b/fs/smb/client/smb2transport.c
@@ -160,7 +160,7 @@ smb2_find_smb_ses_unlocked(struct TCP_Server_Info *server, __u64 ses_id)
spin_unlock(&ses->ses_lock);
continue;
}
- ++ses->ses_count;
+ cifs_smb_ses_inc_refcount(ses);
spin_unlock(&ses->ses_lock);
return ses;
}
diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h
index fb8b2d566efb..b7521e41402e 100644
--- a/fs/smb/server/ksmbd_netlink.h
+++ b/fs/smb/server/ksmbd_netlink.h
@@ -352,7 +352,8 @@ enum KSMBD_TREE_CONN_STATUS {
#define KSMBD_SHARE_FLAG_STREAMS BIT(11)
#define KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS BIT(12)
#define KSMBD_SHARE_FLAG_ACL_XATTR BIT(13)
-#define KSMBD_SHARE_FLAG_UPDATE BIT(14)
+#define KSMBD_SHARE_FLAG_UPDATE BIT(14)
+#define KSMBD_SHARE_FLAG_CROSSMNT BIT(15)
/*
* Tree connect request flags.
diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c
index ced7a9e916f0..9df121bdf349 100644
--- a/fs/smb/server/server.c
+++ b/fs/smb/server/server.c
@@ -286,6 +286,7 @@ static void handle_ksmbd_work(struct work_struct *wk)
static int queue_ksmbd_work(struct ksmbd_conn *conn)
{
struct ksmbd_work *work;
+ int err;
work = ksmbd_alloc_work_struct();
if (!work) {
@@ -297,7 +298,11 @@ static int queue_ksmbd_work(struct ksmbd_conn *conn)
work->request_buf = conn->request_buf;
conn->request_buf = NULL;
- ksmbd_init_smb_server(work);
+ err = ksmbd_init_smb_server(work);
+ if (err) {
+ ksmbd_free_work_struct(work);
+ return 0;
+ }
ksmbd_conn_enqueue_request(work);
atomic_inc(&conn->r_count);
diff --git a/fs/smb/server/smb2misc.c b/fs/smb/server/smb2misc.c
index 33b7e6c4ceff..e881df1d10cb 100644
--- a/fs/smb/server/smb2misc.c
+++ b/fs/smb/server/smb2misc.c
@@ -380,13 +380,13 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work)
}
if (smb2_req_struct_sizes[command] != pdu->StructureSize2) {
- if (command == SMB2_OPLOCK_BREAK_HE &&
- le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_20 &&
- le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_21) {
+ if (!(command == SMB2_OPLOCK_BREAK_HE &&
+ (le16_to_cpu(pdu->StructureSize2) == OP_BREAK_STRUCT_SIZE_20 ||
+ le16_to_cpu(pdu->StructureSize2) == OP_BREAK_STRUCT_SIZE_21))) {
/* special case for SMB2.1 lease break message */
ksmbd_debug(SMB,
- "Illegal request size %d for oplock break\n",
- le16_to_cpu(pdu->StructureSize2));
+ "Illegal request size %u for command %d\n",
+ le16_to_cpu(pdu->StructureSize2), command);
return 1;
}
}
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 2a084d35233a..a947c18915c2 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -87,9 +87,9 @@ struct channel *lookup_chann_list(struct ksmbd_session *sess, struct ksmbd_conn
*/
int smb2_get_ksmbd_tcon(struct ksmbd_work *work)
{
- struct smb2_hdr *req_hdr = smb2_get_msg(work->request_buf);
+ struct smb2_hdr *req_hdr = ksmbd_req_buf_next(work);
unsigned int cmd = le16_to_cpu(req_hdr->Command);
- int tree_id;
+ unsigned int tree_id;
if (cmd == SMB2_TREE_CONNECT_HE ||
cmd == SMB2_CANCEL_HE ||
@@ -114,7 +114,7 @@ int smb2_get_ksmbd_tcon(struct ksmbd_work *work)
pr_err("The first operation in the compound does not have tcon\n");
return -EINVAL;
}
- if (work->tcon->id != tree_id) {
+ if (tree_id != UINT_MAX && work->tcon->id != tree_id) {
pr_err("tree id(%u) is different with id(%u) in first operation\n",
tree_id, work->tcon->id);
return -EINVAL;
@@ -559,9 +559,9 @@ int smb2_allocate_rsp_buf(struct ksmbd_work *work)
*/
int smb2_check_user_session(struct ksmbd_work *work)
{
- struct smb2_hdr *req_hdr = smb2_get_msg(work->request_buf);
+ struct smb2_hdr *req_hdr = ksmbd_req_buf_next(work);
struct ksmbd_conn *conn = work->conn;
- unsigned int cmd = conn->ops->get_cmd_val(work);
+ unsigned int cmd = le16_to_cpu(req_hdr->Command);
unsigned long long sess_id;
/*
@@ -587,7 +587,7 @@ int smb2_check_user_session(struct ksmbd_work *work)
pr_err("The first operation in the compound does not have sess\n");
return -EINVAL;
}
- if (work->sess->id != sess_id) {
+ if (sess_id != ULLONG_MAX && work->sess->id != sess_id) {
pr_err("session id(%llu) is different with the first operation(%lld)\n",
sess_id, work->sess->id);
return -EINVAL;
@@ -2324,9 +2324,16 @@ next:
break;
buf_len -= next;
eabuf = (struct smb2_ea_info *)((char *)eabuf + next);
- if (next < (u32)eabuf->EaNameLength + le16_to_cpu(eabuf->EaValueLength))
+ if (buf_len < sizeof(struct smb2_ea_info)) {
+ rc = -EINVAL;
break;
+ }
+ if (buf_len < sizeof(struct smb2_ea_info) + eabuf->EaNameLength +
+ le16_to_cpu(eabuf->EaValueLength)) {
+ rc = -EINVAL;
+ break;
+ }
} while (next != 0);
kfree(attr_name);
@@ -2467,8 +2474,9 @@ static void smb2_update_xattrs(struct ksmbd_tree_connect *tcon,
}
}
-static int smb2_creat(struct ksmbd_work *work, struct path *path, char *name,
- int open_flags, umode_t posix_mode, bool is_dir)
+static int smb2_creat(struct ksmbd_work *work, struct path *parent_path,
+ struct path *path, char *name, int open_flags,
+ umode_t posix_mode, bool is_dir)
{
struct ksmbd_tree_connect *tcon = work->tcon;
struct ksmbd_share_config *share = tcon->share_conf;
@@ -2495,7 +2503,7 @@ static int smb2_creat(struct ksmbd_work *work, struct path *path, char *name,
return rc;
}
- rc = ksmbd_vfs_kern_path_locked(work, name, 0, path, 0);
+ rc = ksmbd_vfs_kern_path_locked(work, name, 0, parent_path, path, 0);
if (rc) {
pr_err("cannot get linux path (%s), err = %d\n",
name, rc);
@@ -2565,7 +2573,7 @@ int smb2_open(struct ksmbd_work *work)
struct ksmbd_tree_connect *tcon = work->tcon;
struct smb2_create_req *req;
struct smb2_create_rsp *rsp;
- struct path path;
+ struct path path, parent_path;
struct ksmbd_share_config *share = tcon->share_conf;
struct ksmbd_file *fp = NULL;
struct file *filp = NULL;
@@ -2786,7 +2794,8 @@ int smb2_open(struct ksmbd_work *work)
goto err_out1;
}
- rc = ksmbd_vfs_kern_path_locked(work, name, LOOKUP_NO_SYMLINKS, &path, 1);
+ rc = ksmbd_vfs_kern_path_locked(work, name, LOOKUP_NO_SYMLINKS,
+ &parent_path, &path, 1);
if (!rc) {
file_present = true;
@@ -2906,7 +2915,8 @@ int smb2_open(struct ksmbd_work *work)
/*create file if not present */
if (!file_present) {
- rc = smb2_creat(work, &path, name, open_flags, posix_mode,
+ rc = smb2_creat(work, &parent_path, &path, name, open_flags,
+ posix_mode,
req->CreateOptions & FILE_DIRECTORY_FILE_LE);
if (rc) {
if (rc == -ENOENT) {
@@ -3321,8 +3331,9 @@ int smb2_open(struct ksmbd_work *work)
err_out:
if (file_present || created) {
- inode_unlock(d_inode(path.dentry->d_parent));
- dput(path.dentry);
+ inode_unlock(d_inode(parent_path.dentry));
+ path_put(&path);
+ path_put(&parent_path);
}
ksmbd_revert_fsids(work);
err_out1:
@@ -5545,7 +5556,7 @@ static int smb2_create_link(struct ksmbd_work *work,
struct nls_table *local_nls)
{
char *link_name = NULL, *target_name = NULL, *pathname = NULL;
- struct path path;
+ struct path path, parent_path;
bool file_present = false;
int rc;
@@ -5575,7 +5586,7 @@ static int smb2_create_link(struct ksmbd_work *work,
ksmbd_debug(SMB, "target name is %s\n", target_name);
rc = ksmbd_vfs_kern_path_locked(work, link_name, LOOKUP_NO_SYMLINKS,
- &path, 0);
+ &parent_path, &path, 0);
if (rc) {
if (rc != -ENOENT)
goto out;
@@ -5605,8 +5616,9 @@ static int smb2_create_link(struct ksmbd_work *work,
rc = -EINVAL;
out:
if (file_present) {
- inode_unlock(d_inode(path.dentry->d_parent));
+ inode_unlock(d_inode(parent_path.dentry));
path_put(&path);
+ path_put(&parent_path);
}
if (!IS_ERR(link_name))
kfree(link_name);
@@ -6209,6 +6221,11 @@ int smb2_read(struct ksmbd_work *work)
unsigned int max_read_size = conn->vals->max_read_size;
WORK_BUFFERS(work, req, rsp);
+ if (work->next_smb2_rcv_hdr_off) {
+ work->send_no_response = 1;
+ err = -EOPNOTSUPP;
+ goto out;
+ }
if (test_share_config_flag(work->tcon->share_conf,
KSMBD_SHARE_FLAG_PIPE)) {
@@ -8609,7 +8626,8 @@ int smb3_decrypt_req(struct ksmbd_work *work)
struct smb2_transform_hdr *tr_hdr = smb2_get_msg(buf);
int rc = 0;
- if (buf_data_size < sizeof(struct smb2_hdr)) {
+ if (pdu_length < sizeof(struct smb2_transform_hdr) ||
+ buf_data_size < sizeof(struct smb2_hdr)) {
pr_err("Transform message is too small (%u)\n",
pdu_length);
return -ECONNABORTED;
diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c
index ef20f63e55e6..c2b75d898852 100644
--- a/fs/smb/server/smb_common.c
+++ b/fs/smb/server/smb_common.c
@@ -388,26 +388,29 @@ static struct smb_version_cmds smb1_server_cmds[1] = {
[SMB_COM_NEGOTIATE_EX] = { .proc = smb1_negotiate, },
};
-static void init_smb1_server(struct ksmbd_conn *conn)
+static int init_smb1_server(struct ksmbd_conn *conn)
{
conn->ops = &smb1_server_ops;
conn->cmds = smb1_server_cmds;
conn->max_cmds = ARRAY_SIZE(smb1_server_cmds);
+ return 0;
}
-void ksmbd_init_smb_server(struct ksmbd_work *work)
+int ksmbd_init_smb_server(struct ksmbd_work *work)
{
struct ksmbd_conn *conn = work->conn;
__le32 proto;
- if (conn->need_neg == false)
- return;
-
proto = *(__le32 *)((struct smb_hdr *)work->request_buf)->Protocol;
+ if (conn->need_neg == false) {
+ if (proto == SMB1_PROTO_NUMBER)
+ return -EINVAL;
+ return 0;
+ }
+
if (proto == SMB1_PROTO_NUMBER)
- init_smb1_server(conn);
- else
- init_smb3_11_server(conn);
+ return init_smb1_server(conn);
+ return init_smb3_11_server(conn);
}
int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level,
diff --git a/fs/smb/server/smb_common.h b/fs/smb/server/smb_common.h
index aeca0f46068f..f1092519c0c2 100644
--- a/fs/smb/server/smb_common.h
+++ b/fs/smb/server/smb_common.h
@@ -427,7 +427,7 @@ bool ksmbd_smb_request(struct ksmbd_conn *conn);
int ksmbd_lookup_dialect_by_id(__le16 *cli_dialects, __le16 dialects_count);
-void ksmbd_init_smb_server(struct ksmbd_work *work);
+int ksmbd_init_smb_server(struct ksmbd_work *work);
struct ksmbd_kstat;
int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work,
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index d0e94b73931a..d48756a339a5 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -63,13 +63,13 @@ int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child)
static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf,
char *pathname, unsigned int flags,
+ struct path *parent_path,
struct path *path)
{
struct qstr last;
struct filename *filename;
struct path *root_share_path = &share_conf->vfs_path;
int err, type;
- struct path parent_path;
struct dentry *d;
if (pathname[0] == '\0') {
@@ -84,7 +84,7 @@ static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf,
return PTR_ERR(filename);
err = vfs_path_parent_lookup(filename, flags,
- &parent_path, &last, &type,
+ parent_path, &last, &type,
root_share_path);
if (err) {
putname(filename);
@@ -92,13 +92,13 @@ static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf,
}
if (unlikely(type != LAST_NORM)) {
- path_put(&parent_path);
+ path_put(parent_path);
putname(filename);
return -ENOENT;
}
- inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT);
- d = lookup_one_qstr_excl(&last, parent_path.dentry, 0);
+ inode_lock_nested(parent_path->dentry->d_inode, I_MUTEX_PARENT);
+ d = lookup_one_qstr_excl(&last, parent_path->dentry, 0);
if (IS_ERR(d))
goto err_out;
@@ -108,15 +108,22 @@ static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf,
}
path->dentry = d;
- path->mnt = share_conf->vfs_path.mnt;
- path_put(&parent_path);
- putname(filename);
+ path->mnt = mntget(parent_path->mnt);
+ if (test_share_config_flag(share_conf, KSMBD_SHARE_FLAG_CROSSMNT)) {
+ err = follow_down(path, 0);
+ if (err < 0) {
+ path_put(path);
+ goto err_out;
+ }
+ }
+
+ putname(filename);
return 0;
err_out:
- inode_unlock(parent_path.dentry->d_inode);
- path_put(&parent_path);
+ inode_unlock(d_inode(parent_path->dentry));
+ path_put(parent_path);
putname(filename);
return -ENOENT;
}
@@ -412,7 +419,8 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos,
{
char *stream_buf = NULL, *wbuf;
struct mnt_idmap *idmap = file_mnt_idmap(fp->filp);
- size_t size, v_len;
+ size_t size;
+ ssize_t v_len;
int err = 0;
ksmbd_debug(VFS, "write stream data pos : %llu, count : %zd\n",
@@ -429,9 +437,9 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos,
fp->stream.name,
fp->stream.size,
&stream_buf);
- if ((int)v_len < 0) {
+ if (v_len < 0) {
pr_err("not found stream in xattr : %zd\n", v_len);
- err = (int)v_len;
+ err = v_len;
goto out;
}
@@ -1194,14 +1202,14 @@ static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name,
* Return: 0 on success, otherwise error
*/
int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name,
- unsigned int flags, struct path *path,
- bool caseless)
+ unsigned int flags, struct path *parent_path,
+ struct path *path, bool caseless)
{
struct ksmbd_share_config *share_conf = work->tcon->share_conf;
int err;
- struct path parent_path;
- err = ksmbd_vfs_path_lookup_locked(share_conf, name, flags, path);
+ err = ksmbd_vfs_path_lookup_locked(share_conf, name, flags, parent_path,
+ path);
if (!err)
return 0;
@@ -1216,10 +1224,10 @@ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name,
path_len = strlen(filepath);
remain_len = path_len;
- parent_path = share_conf->vfs_path;
- path_get(&parent_path);
+ *parent_path = share_conf->vfs_path;
+ path_get(parent_path);
- while (d_can_lookup(parent_path.dentry)) {
+ while (d_can_lookup(parent_path->dentry)) {
char *filename = filepath + path_len - remain_len;
char *next = strchrnul(filename, '/');
size_t filename_len = next - filename;
@@ -1228,7 +1236,7 @@ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name,
if (filename_len == 0)
break;
- err = ksmbd_vfs_lookup_in_dir(&parent_path, filename,
+ err = ksmbd_vfs_lookup_in_dir(parent_path, filename,
filename_len,
work->conn->um);
if (err)
@@ -1245,8 +1253,8 @@ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name,
goto out2;
else if (is_last)
goto out1;
- path_put(&parent_path);
- parent_path = *path;
+ path_put(parent_path);
+ *parent_path = *path;
next[0] = '/';
remain_len -= filename_len + 1;
@@ -1254,16 +1262,17 @@ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name,
err = -EINVAL;
out2:
- path_put(&parent_path);
+ path_put(parent_path);
out1:
kfree(filepath);
}
if (!err) {
- err = ksmbd_vfs_lock_parent(parent_path.dentry, path->dentry);
- if (err)
- dput(path->dentry);
- path_put(&parent_path);
+ err = ksmbd_vfs_lock_parent(parent_path->dentry, path->dentry);
+ if (err) {
+ path_put(path);
+ path_put(parent_path);
+ }
}
return err;
}
diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h
index 80039312c255..72f9fb4b48d1 100644
--- a/fs/smb/server/vfs.h
+++ b/fs/smb/server/vfs.h
@@ -115,8 +115,8 @@ int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name,
int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap,
const struct path *path, char *attr_name);
int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name,
- unsigned int flags, struct path *path,
- bool caseless);
+ unsigned int flags, struct path *parent_path,
+ struct path *path, bool caseless);
struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work,
const char *name,
unsigned int flags,
diff --git a/fs/splice.c b/fs/splice.c
index 004eb1c4ce31..3e2a31e1ce6a 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -876,6 +876,8 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
msg.msg_flags |= MSG_MORE;
if (remain && pipe_occupancy(pipe->head, tail) > 0)
msg.msg_flags |= MSG_MORE;
+ if (out->f_flags & O_NONBLOCK)
+ msg.msg_flags |= MSG_DONTWAIT;
iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, bc,
len - remain);
diff --git a/fs/super.c b/fs/super.c
index e781226e2880..e04a403dbffd 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -546,17 +546,31 @@ bool mount_capable(struct fs_context *fc)
* @test: Comparison callback
* @set: Setup callback
*
- * Find or create a superblock using the parameters stored in the filesystem
- * context and the two callback functions.
+ * Create a new superblock or find an existing one.
*
- * If an extant superblock is matched, then that will be returned with an
- * elevated reference count that the caller must transfer or discard.
+ * The @test callback is used to find a matching existing superblock.
+ * Whether or not the requested parameters in @fc are taken into account
+ * is specific to the @test callback that is used. They may even be
+ * completely ignored.
+ *
+ * If an extant superblock is matched, it will be returned unless:
+ *
+ * (1) the namespace the filesystem context @fc and the extant
+ * superblock's namespace differ
+ *
+ * (2) the filesystem context @fc has requested that reusing an extant
+ * superblock is not allowed
+ *
+ * In both cases EBUSY will be returned.
*
* If no match is made, a new superblock will be allocated and basic
- * initialisation will be performed (s_type, s_fs_info and s_id will be set and
- * the set() callback will be invoked), the superblock will be published and it
- * will be returned in a partially constructed state with SB_BORN and SB_ACTIVE
- * as yet unset.
+ * initialisation will be performed (s_type, s_fs_info and s_id will be
+ * set and the @set callback will be invoked), the superblock will be
+ * published and it will be returned in a partially constructed state
+ * with SB_BORN and SB_ACTIVE as yet unset.
+ *
+ * Return: On success, an extant or newly created superblock is
+ * returned. On failure an error pointer is returned.
*/
struct super_block *sget_fc(struct fs_context *fc,
int (*test)(struct super_block *, struct fs_context *),
@@ -603,9 +617,13 @@ retry:
return s;
share_extant_sb:
- if (user_ns != old->s_user_ns) {
+ if (user_ns != old->s_user_ns || fc->exclusive) {
spin_unlock(&sb_lock);
destroy_unused_super(s);
+ if (fc->exclusive)
+ warnfc(fc, "reusing existing filesystem not allowed");
+ else
+ warnfc(fc, "reusing existing filesystem in another namespace not allowed");
return ERR_PTR(-EBUSY);
}
if (!grab_super(old))
@@ -1136,7 +1154,7 @@ static int test_single_super(struct super_block *s, struct fs_context *fc)
return 1;
}
-static int vfs_get_super(struct fs_context *fc, bool reconf,
+static int vfs_get_super(struct fs_context *fc,
int (*test)(struct super_block *, struct fs_context *),
int (*fill_super)(struct super_block *sb,
struct fs_context *fc))
@@ -1154,19 +1172,9 @@ static int vfs_get_super(struct fs_context *fc, bool reconf,
goto error;
sb->s_flags |= SB_ACTIVE;
- fc->root = dget(sb->s_root);
- } else {
- fc->root = dget(sb->s_root);
- if (reconf) {
- err = reconfigure_super(fc);
- if (err < 0) {
- dput(fc->root);
- fc->root = NULL;
- goto error;
- }
- }
}
+ fc->root = dget(sb->s_root);
return 0;
error:
@@ -1178,7 +1186,7 @@ int get_tree_nodev(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc))
{
- return vfs_get_super(fc, false, NULL, fill_super);
+ return vfs_get_super(fc, NULL, fill_super);
}
EXPORT_SYMBOL(get_tree_nodev);
@@ -1186,25 +1194,17 @@ int get_tree_single(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc))
{
- return vfs_get_super(fc, false, test_single_super, fill_super);
+ return vfs_get_super(fc, test_single_super, fill_super);
}
EXPORT_SYMBOL(get_tree_single);
-int get_tree_single_reconf(struct fs_context *fc,
- int (*fill_super)(struct super_block *sb,
- struct fs_context *fc))
-{
- return vfs_get_super(fc, true, test_single_super, fill_super);
-}
-EXPORT_SYMBOL(get_tree_single_reconf);
-
int get_tree_keyed(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc),
void *key)
{
fc->s_fs_info = key;
- return vfs_get_super(fc, false, test_keyed_super, fill_super);
+ return vfs_get_super(fc, test_keyed_super, fill_super);
}
EXPORT_SYMBOL(get_tree_keyed);
diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c
index 075f15c43c78..5f1a14d5b927 100644
--- a/fs/vboxsf/dir.c
+++ b/fs/vboxsf/dir.c
@@ -179,9 +179,10 @@ static int vboxsf_dir_iterate(struct file *dir, struct dir_context *ctx)
return 0;
}
+WRAP_DIR_ITER(vboxsf_dir_iterate) // FIXME!
const struct file_operations vboxsf_dir_fops = {
.open = vboxsf_dir_open,
- .iterate = vboxsf_dir_iterate,
+ .iterate_shared = shared_vboxsf_dir_iterate,
.release = vboxsf_dir_release,
.read = generic_read_dir,
.llseek = generic_file_llseek,
diff --git a/fs/vboxsf/shfl_hostintf.h b/fs/vboxsf/shfl_hostintf.h
index aca829062c12..069a019c9247 100644
--- a/fs/vboxsf/shfl_hostintf.h
+++ b/fs/vboxsf/shfl_hostintf.h
@@ -68,9 +68,9 @@ struct shfl_string {
/** UTF-8 or UTF-16 string. Nul terminated. */
union {
- u8 utf8[2];
- u16 utf16[1];
- u16 ucs2[1]; /* misnomer, use utf16. */
+ u8 legacy_padding[2];
+ DECLARE_FLEX_ARRAY(u8, utf8);
+ DECLARE_FLEX_ARRAY(u16, utf16);
} string;
};
VMMDEV_ASSERT_SIZE(shfl_string, 6);
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 25e2841084e1..f9015f88eca7 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -591,7 +591,7 @@ struct xfs_attr_shortform {
uint8_t valuelen; /* actual length of value (no NULL) */
uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
uint8_t nameval[]; /* name & value bytes concatenated */
- } list[1]; /* variable sized array */
+ } list[]; /* variable sized array */
};
typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */
@@ -620,19 +620,29 @@ typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */
typedef struct xfs_attr_leaf_name_local {
__be16 valuelen; /* number of bytes in value */
__u8 namelen; /* length of name bytes */
- __u8 nameval[1]; /* name/value bytes */
+ /*
+ * In Linux 6.5 this flex array was converted from nameval[1] to
+ * nameval[]. Be very careful here about extra padding at the end;
+ * see xfs_attr_leaf_entsize_local() for details.
+ */
+ __u8 nameval[]; /* name/value bytes */
} xfs_attr_leaf_name_local_t;
typedef struct xfs_attr_leaf_name_remote {
__be32 valueblk; /* block number of value bytes */
__be32 valuelen; /* number of bytes in value */
__u8 namelen; /* length of name bytes */
- __u8 name[1]; /* name bytes */
+ /*
+ * In Linux 6.5 this flex array was converted from name[1] to name[].
+ * Be very careful here about extra padding at the end; see
+ * xfs_attr_leaf_entsize_remote() for details.
+ */
+ __u8 name[]; /* name bytes */
} xfs_attr_leaf_name_remote_t;
typedef struct xfs_attr_leafblock {
xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */
- xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */
+ xfs_attr_leaf_entry_t entries[]; /* sorted on key, not name */
/*
* The rest of the block contains the following structures after the
* leaf entries, growing from the bottom up. The variables are never
@@ -664,7 +674,7 @@ struct xfs_attr3_leaf_hdr {
struct xfs_attr3_leafblock {
struct xfs_attr3_leaf_hdr hdr;
- struct xfs_attr_leaf_entry entries[1];
+ struct xfs_attr_leaf_entry entries[];
/*
* The rest of the block contains the following structures after the
@@ -747,14 +757,61 @@ xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
*/
static inline int xfs_attr_leaf_entsize_remote(int nlen)
{
- return round_up(sizeof(struct xfs_attr_leaf_name_remote) - 1 +
- nlen, XFS_ATTR_LEAF_NAME_ALIGN);
+ /*
+ * Prior to Linux 6.5, struct xfs_attr_leaf_name_remote ended with
+ * name[1], which was used as a flexarray. The layout of this struct
+ * is 9 bytes of fixed-length fields followed by a __u8 flex array at
+ * offset 9.
+ *
+ * On most architectures, struct xfs_attr_leaf_name_remote had two
+ * bytes of implicit padding at the end of the struct to make the
+ * struct length 12. After converting name[1] to name[], there are
+ * three implicit padding bytes and the struct size remains 12.
+ * However, there are compiler configurations that do not add implicit
+ * padding at all (m68k) and have been broken for years.
+ *
+ * This entsize computation historically added (the xattr name length)
+ * to (the padded struct length - 1) and rounded that sum up to the
+ * nearest multiple of 4 (NAME_ALIGN). IOWs, round_up(11 + nlen, 4).
+ * This is encoded in the ondisk format, so we cannot change this.
+ *
+ * Compute the entsize from offsetof of the flexarray and manually
+ * adding bytes for the implicit padding.
+ */
+ const size_t remotesize =
+ offsetof(struct xfs_attr_leaf_name_remote, name) + 2;
+
+ return round_up(remotesize + nlen, XFS_ATTR_LEAF_NAME_ALIGN);
}
static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
{
- return round_up(sizeof(struct xfs_attr_leaf_name_local) - 1 +
- nlen + vlen, XFS_ATTR_LEAF_NAME_ALIGN);
+ /*
+ * Prior to Linux 6.5, struct xfs_attr_leaf_name_local ended with
+ * nameval[1], which was used as a flexarray. The layout of this
+ * struct is 3 bytes of fixed-length fields followed by a __u8 flex
+ * array at offset 3.
+ *
+ * struct xfs_attr_leaf_name_local had zero bytes of implicit padding
+ * at the end of the struct to make the struct length 4. On most
+ * architectures, after converting nameval[1] to nameval[], there is
+ * one implicit padding byte and the struct size remains 4. However,
+ * there are compiler configurations that do not add implicit padding
+ * at all (m68k) and would break.
+ *
+ * This entsize computation historically added (the xattr name and
+ * value length) to (the padded struct length - 1) and rounded that sum
+ * up to the nearest multiple of 4 (NAME_ALIGN). IOWs, the formula is
+ * round_up(3 + nlen + vlen, 4). This is encoded in the ondisk format,
+ * so we cannot change this.
+ *
+ * Compute the entsize from offsetof of the flexarray and manually
+ * adding bytes for the implicit padding.
+ */
+ const size_t localsize =
+ offsetof(struct xfs_attr_leaf_name_local, nameval);
+
+ return round_up(localsize + nlen + vlen, XFS_ATTR_LEAF_NAME_ALIGN);
}
static inline int xfs_attr_leaf_entsize_local_max(int bsize)
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 9c60ebb328b4..2cbf9ea39b8c 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -592,12 +592,12 @@ typedef struct xfs_attrlist_cursor {
struct xfs_attrlist {
__s32 al_count; /* number of entries in attrlist */
__s32 al_more; /* T/F: more attrs (do call again) */
- __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */
+ __s32 al_offset[]; /* byte offsets of attrs [var-sized] */
};
struct xfs_attrlist_ent { /* data from attr_list() */
__u32 a_valuelen; /* number bytes in value of attr */
- char a_name[1]; /* attr name (NULL terminated) */
+ char a_name[]; /* attr name (NULL terminated) */
};
typedef struct xfs_fsop_attrlist_handlereq {
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index 9737b5a9f405..c4cc99b70dd3 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -56,7 +56,7 @@ xfs_check_ondisk_structs(void)
/* dir/attr trees */
XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leaf_hdr, 80);
- XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leafblock, 88);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leafblock, 80);
XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_rmt_hdr, 56);
XFS_CHECK_STRUCT_SIZE(struct xfs_da3_blkinfo, 56);
XFS_CHECK_STRUCT_SIZE(struct xfs_da3_intnode, 64);
@@ -88,7 +88,8 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valuelen, 4);
XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8);
XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9);
- XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40);
+ XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 32);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr_shortform, 4);
XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.totsize, 0);
XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.count, 2);
XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].namelen, 4);
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 92c9aaae3663..789cfb74c146 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -341,77 +341,6 @@ static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence)
return generic_file_llseek_size(file, offset, whence, isize, isize);
}
-struct zonefs_zone_append_bio {
- /* The target inode of the BIO */
- struct inode *inode;
-
- /* For sync writes, the target append write offset */
- u64 append_offset;
-
- /*
- * This member must come last, bio_alloc_bioset will allocate enough
- * bytes for entire zonefs_bio but relies on bio being last.
- */
- struct bio bio;
-};
-
-static inline struct zonefs_zone_append_bio *
-zonefs_zone_append_bio(struct bio *bio)
-{
- return container_of(bio, struct zonefs_zone_append_bio, bio);
-}
-
-static void zonefs_file_zone_append_dio_bio_end_io(struct bio *bio)
-{
- struct zonefs_zone_append_bio *za_bio = zonefs_zone_append_bio(bio);
- struct zonefs_zone *z = zonefs_inode_zone(za_bio->inode);
- sector_t za_sector;
-
- if (bio->bi_status != BLK_STS_OK)
- goto bio_end;
-
- /*
- * If the file zone was written underneath the file system, the zone
- * append operation can still succedd (if the zone is not full) but
- * the write append location will not be where we expect it to be.
- * Check that we wrote where we intended to, that is, at z->z_wpoffset.
- */
- za_sector = z->z_sector + (za_bio->append_offset >> SECTOR_SHIFT);
- if (bio->bi_iter.bi_sector != za_sector) {
- zonefs_warn(za_bio->inode->i_sb,
- "Invalid write sector %llu for zone at %llu\n",
- bio->bi_iter.bi_sector, z->z_sector);
- bio->bi_status = BLK_STS_IOERR;
- }
-
-bio_end:
- iomap_dio_bio_end_io(bio);
-}
-
-static void zonefs_file_zone_append_dio_submit_io(const struct iomap_iter *iter,
- struct bio *bio,
- loff_t file_offset)
-{
- struct zonefs_zone_append_bio *za_bio = zonefs_zone_append_bio(bio);
- struct inode *inode = iter->inode;
- struct zonefs_zone *z = zonefs_inode_zone(inode);
-
- /*
- * Issue a zone append BIO to process sync dio writes. The append
- * file offset is saved to check the zone append write location
- * on completion of the BIO.
- */
- za_bio->inode = inode;
- za_bio->append_offset = file_offset;
-
- bio->bi_opf &= ~REQ_OP_WRITE;
- bio->bi_opf |= REQ_OP_ZONE_APPEND;
- bio->bi_iter.bi_sector = z->z_sector;
- bio->bi_end_io = zonefs_file_zone_append_dio_bio_end_io;
-
- submit_bio(bio);
-}
-
static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
int error, unsigned int flags)
{
@@ -442,14 +371,6 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
return 0;
}
-static struct bio_set zonefs_zone_append_bio_set;
-
-static const struct iomap_dio_ops zonefs_zone_append_dio_ops = {
- .submit_io = zonefs_file_zone_append_dio_submit_io,
- .end_io = zonefs_file_write_dio_end_io,
- .bio_set = &zonefs_zone_append_bio_set,
-};
-
static const struct iomap_dio_ops zonefs_write_dio_ops = {
.end_io = zonefs_file_write_dio_end_io,
};
@@ -533,9 +454,6 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
struct zonefs_inode_info *zi = ZONEFS_I(inode);
struct zonefs_zone *z = zonefs_inode_zone(inode);
struct super_block *sb = inode->i_sb;
- const struct iomap_dio_ops *dio_ops;
- bool sync = is_sync_kiocb(iocb);
- bool append = false;
ssize_t ret, count;
/*
@@ -543,7 +461,8 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
* as this can cause write reordering (e.g. the first aio gets EAGAIN
* on the inode lock but the second goes through but is now unaligned).
*/
- if (zonefs_zone_is_seq(z) && !sync && (iocb->ki_flags & IOCB_NOWAIT))
+ if (zonefs_zone_is_seq(z) && !is_sync_kiocb(iocb) &&
+ (iocb->ki_flags & IOCB_NOWAIT))
return -EOPNOTSUPP;
if (iocb->ki_flags & IOCB_NOWAIT) {
@@ -573,18 +492,6 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
goto inode_unlock;
}
mutex_unlock(&zi->i_truncate_mutex);
- append = sync;
- }
-
- if (append) {
- unsigned int max = bdev_max_zone_append_sectors(sb->s_bdev);
-
- max = ALIGN_DOWN(max << SECTOR_SHIFT, sb->s_blocksize);
- iov_iter_truncate(from, max);
-
- dio_ops = &zonefs_zone_append_dio_ops;
- } else {
- dio_ops = &zonefs_write_dio_ops;
}
/*
@@ -593,7 +500,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
* the user can make sense of the error.
*/
ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
- dio_ops, 0, NULL, 0);
+ &zonefs_write_dio_ops, 0, NULL, 0);
if (ret == -ENOTBLK)
ret = -EBUSY;
@@ -938,15 +845,3 @@ const struct file_operations zonefs_file_operations = {
.splice_write = iter_file_splice_write,
.iopoll = iocb_bio_iopoll,
};
-
-int zonefs_file_bioset_init(void)
-{
- return bioset_init(&zonefs_zone_append_bio_set, BIO_POOL_SIZE,
- offsetof(struct zonefs_zone_append_bio, bio),
- BIOSET_NEED_BVECS);
-}
-
-void zonefs_file_bioset_exit(void)
-{
- bioset_exit(&zonefs_zone_append_bio_set);
-}
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index eaaa23cf6d93..9d1a9808fbbb 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -1414,13 +1414,9 @@ static int __init zonefs_init(void)
BUILD_BUG_ON(sizeof(struct zonefs_super) != ZONEFS_SUPER_SIZE);
- ret = zonefs_file_bioset_init();
- if (ret)
- return ret;
-
ret = zonefs_init_inodecache();
if (ret)
- goto destroy_bioset;
+ return ret;
ret = zonefs_sysfs_init();
if (ret)
@@ -1436,8 +1432,6 @@ sysfs_exit:
zonefs_sysfs_exit();
destroy_inodecache:
zonefs_destroy_inodecache();
-destroy_bioset:
- zonefs_file_bioset_exit();
return ret;
}
@@ -1447,7 +1441,6 @@ static void __exit zonefs_exit(void)
unregister_filesystem(&zonefs_type);
zonefs_sysfs_exit();
zonefs_destroy_inodecache();
- zonefs_file_bioset_exit();
}
MODULE_AUTHOR("Damien Le Moal");
diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h
index f663b8ebc2cb..8175652241b5 100644
--- a/fs/zonefs/zonefs.h
+++ b/fs/zonefs/zonefs.h
@@ -279,8 +279,6 @@ extern const struct file_operations zonefs_dir_operations;
extern const struct address_space_operations zonefs_file_aops;
extern const struct file_operations zonefs_file_operations;
int zonefs_file_truncate(struct inode *inode, loff_t isize);
-int zonefs_file_bioset_init(void);
-void zonefs_file_bioset_exit(void);
/* In sysfs.c */
int zonefs_sysfs_register(struct super_block *sb);