Diffstat (limited to 'fs')
677 files changed, 18163 insertions, 16604 deletions
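The dominant pattern in this diff is mechanical: inode operations and VFS helpers stop taking a struct user_namespace *mnt_userns and take a struct mnt_idmap *idmap instead, and filesystems that do not support idmapped mounts now pass &nop_mnt_idmap where they previously passed &init_user_ns. A minimal sketch of what a converted ->setattr looks like after this series; "examplefs" is hypothetical and not part of the diff:

#include <linux/fs.h>
#include <linux/mnt_idmapping.h>

/* Sketch only: mirrors the signature change applied throughout fs/ below. */
static int examplefs_setattr(struct mnt_idmap *idmap,
			     struct dentry *dentry, struct iattr *iattr)
{
	struct inode *inode = d_inode(dentry);
	int error;

	/*
	 * Permission checks are now keyed on the mount's idmap, not a
	 * user namespace; filesystems without idmapped-mount support
	 * pass &nop_mnt_idmap here, as 9p, adfs and affs do below.
	 */
	error = setattr_prepare(idmap, dentry, iattr);
	if (error)
		return error;

	setattr_copy(idmap, inode, iattr);
	mark_inode_dirty(inode);
	return 0;
}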
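Further down, fs/afs/write.c converts afs_writepages_region() from find_get_pages_range_tag(), which returned one page at a time, to filemap_get_folios_tag() with a folio_batch. A condensed sketch of that batching pattern, with the locking and writeback details of the real hunk elided and a hypothetical helper name:

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/printk.h>
#include <linux/sched.h>

static void walk_dirty_folios(struct address_space *mapping,
			      pgoff_t index, pgoff_t end)
{
	struct folio_batch fbatch;
	unsigned int i, n;

	folio_batch_init(&fbatch);
	while ((n = filemap_get_folios_tag(mapping, &index, end,
					   PAGECACHE_TAG_DIRTY, &fbatch))) {
		for (i = 0; i < n; i++) {
			struct folio *folio = fbatch.folios[i];

			if (!folio_trylock(folio))
				continue;
			/*
			 * Revalidate: the folio may have been truncated or
			 * cleaned between the tagged lookup and the lock.
			 */
			if (folio->mapping == mapping &&
			    folio_test_dirty(folio))
				pr_debug("would write back folio %lu\n",
					 folio->index);
			folio_unlock(folio);
		}
		/* Drops the reference the lookup took on each folio. */
		folio_batch_release(&fbatch);
		cond_resched();
	}
}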
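Also in afs, afs_launder_folio() replaces its open-coded one-element bio_vec array with bvec_set_folio() feeding a single-segment ITER_SOURCE iterator. The idiom, as a standalone sketch (hypothetical function name; note the bio_vec must stay in scope for as long as the iterator is consumed):

#include <linux/bvec.h>
#include <linux/uio.h>

static size_t folio_range_iter_len(struct folio *folio, size_t from, size_t to)
{
	struct bio_vec bv;
	struct iov_iter iter;

	/* One bvec covering bytes [from, to) of the folio. */
	bvec_set_folio(&bv, folio, to - from, from);
	iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, bv.bv_len);

	/* &iter would now be handed to the store/write-out path. */
	return iov_iter_count(&iter);
}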
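The fs/binfmt_elf.c hunk exports two new auxiliary-vector entries so userspace can size and align extensible struct rseq registrations. A userspace-side sketch of consuming them; the fallback constants are assumed to mirror include/uapi/linux/auxvec.h, and getauxval() simply returns 0 on kernels that predate the entries:

#include <stdio.h>
#include <sys/auxv.h>

#ifndef AT_RSEQ_FEATURE_SIZE
#define AT_RSEQ_FEATURE_SIZE	27	/* rseq supported feature size */
#define AT_RSEQ_ALIGN		28	/* rseq allocation alignment */
#endif

int main(void)
{
	unsigned long feature_size = getauxval(AT_RSEQ_FEATURE_SIZE);
	unsigned long align = getauxval(AT_RSEQ_ALIGN);

	printf("rseq feature size: %lu, alignment: %lu\n",
	       feature_size, align);
	return 0;
}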
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index c397c51f80d9..eed551d8555f 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -139,7 +139,7 @@ struct posix_acl *v9fs_iop_get_inode_acl(struct inode *inode, int type, bool rcu } -struct posix_acl *v9fs_iop_get_acl(struct user_namespace *mnt_userns, +struct posix_acl *v9fs_iop_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type) { struct v9fs_session_info *v9ses; @@ -151,7 +151,7 @@ struct posix_acl *v9fs_iop_get_acl(struct user_namespace *mnt_userns, return v9fs_get_cached_acl(d_inode(dentry), type); } -int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int v9fs_iop_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int retval; @@ -195,7 +195,7 @@ int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, goto err_out; } - if (!inode_owner_or_capable(&init_user_ns, inode)) { + if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) { retval = -EPERM; goto err_out; } @@ -206,7 +206,7 @@ int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr iattr = {}; struct posix_acl *acl_mode = acl; - retval = posix_acl_update_mode(&init_user_ns, inode, + retval = posix_acl_update_mode(&nop_mnt_idmap, inode, &iattr.ia_mode, &acl_mode); if (retval) @@ -225,7 +225,7 @@ int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, * FIXME should we update ctime ? * What is the following setxattr update the mode ? */ - v9fs_vfs_setattr_dotl(&init_user_ns, dentry, &iattr); + v9fs_vfs_setattr_dotl(&nop_mnt_idmap, dentry, &iattr); } break; case ACL_TYPE_DEFAULT: diff --git a/fs/9p/acl.h b/fs/9p/acl.h index 4c60a2bce5de..333cfcc281da 100644 --- a/fs/9p/acl.h +++ b/fs/9p/acl.h @@ -10,9 +10,9 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid); struct posix_acl *v9fs_iop_get_inode_acl(struct inode *inode, int type, bool rcu); -struct posix_acl *v9fs_iop_get_acl(struct user_namespace *mnt_userns, +struct posix_acl *v9fs_iop_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type); -int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int v9fs_iop_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid); int v9fs_set_create_acl(struct inode *inode, struct p9_fid *fid, diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 3a9c4517265f..61a51b90600d 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -468,7 +468,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, #ifdef CONFIG_9P_FSCACHE /* register the session for caching */ - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + if (v9ses->cache == CACHE_FSCACHE) { rc = v9fs_cache_session_get_cookie(v9ses, dev_name); if (rc < 0) goto err_clnt; diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 6acabc2e7dc9..f3f74d197b5d 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -151,7 +151,7 @@ extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d); extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d); -extern int v9fs_vfs_rename(struct user_namespace *mnt_userns, +extern int v9fs_vfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index bc417da7e9c1..75106b9f293d 
100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -60,7 +60,7 @@ void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat); int v9fs_uflags2omode(int uflags, int extended); void v9fs_blank_wstat(struct p9_wstat *wstat); -int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns, +int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr); int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, int datasync); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 97599edbc300..6f46d7e4c750 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -279,8 +279,6 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping, p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); - BUG_ON(!v9inode->writeback_fid); - /* Prefetch area to be written into the cache if we're caching this * file. We need to do this before we get a lock on the page in case * there's more than one writer competing for the same cache block. diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 59b0e8948f78..3d74b04fe0de 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -197,7 +197,7 @@ static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx) /** - * v9fs_dir_release - close a directory + * v9fs_dir_release - called on a close of a file or directory * @inode: inode of the directory * @filp: file pointer to a directory * @@ -209,6 +209,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) struct p9_fid *fid; __le32 version; loff_t i_size; + int retval = 0; fid = filp->private_data; p9_debug(P9_DEBUG_VFS, "inode: %p filp: %p fid: %d\n", @@ -217,7 +218,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) spin_lock(&inode->i_lock); hlist_del(&fid->ilist); spin_unlock(&inode->i_lock); - p9_fid_put(fid); + retval = p9_fid_put(fid); } if ((filp->f_mode & FMODE_WRITE)) { @@ -228,7 +229,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) } else { fscache_unuse_cookie(v9fs_inode_cookie(v9inode), NULL, NULL); } - return 0; + return retval; } const struct file_operations v9fs_dir_operations = { diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index b740017634ef..44c15eb2b908 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -9,6 +9,7 @@ #include <linux/module.h> #include <linux/errno.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/sched.h> #include <linux/file.h> #include <linux/stat.h> @@ -73,8 +74,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) } mutex_lock(&v9inode->v_mutex); - if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) && - !v9inode->writeback_fid && + if ((v9ses->cache) && !v9inode->writeback_fid && ((file->f_flags & O_ACCMODE) != O_RDONLY)) { /* * clone a fid and add it to writeback_fid @@ -92,9 +92,11 @@ int v9fs_file_open(struct inode *inode, struct file *file) v9inode->writeback_fid = (void *) writeback_fid; } mutex_unlock(&v9inode->v_mutex); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) +#ifdef CONFIG_9P_FSCACHE + if (v9ses->cache == CACHE_FSCACHE) fscache_use_cookie(v9fs_inode_cookie(v9inode), file->f_mode & FMODE_WRITE); +#endif v9fs_open_fid_add(inode, &fid); return 0; out_error: diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 27a04a226d97..1d523bec0a94 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -260,7 +260,7 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, { int err = 0; - inode_init_owner(&init_user_ns, inode, NULL, mode); + 
inode_init_owner(&nop_mnt_idmap, inode, NULL, mode); inode->i_blocks = 0; inode->i_rdev = rdev; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); @@ -672,7 +672,7 @@ error: /** * v9fs_vfs_create - VFS hook to create a regular file - * @mnt_userns: The user namespace of the mount + * @idmap: idmap of the mount * @dir: The parent directory * @dentry: The name of file to be created * @mode: The UNIX file mode to set @@ -684,7 +684,7 @@ error: */ static int -v9fs_vfs_create(struct user_namespace *mnt_userns, struct inode *dir, +v9fs_vfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); @@ -704,14 +704,14 @@ v9fs_vfs_create(struct user_namespace *mnt_userns, struct inode *dir, /** * v9fs_vfs_mkdir - VFS mkdir hook to create a directory - * @mnt_userns: The user namespace of the mount + * @idmap: idmap of the mount * @dir: inode that is being unlinked * @dentry: dentry that is being unlinked * @mode: mode for new directory * */ -static int v9fs_vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int v9fs_vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int err; @@ -843,8 +843,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, inode = d_inode(dentry); v9inode = V9FS_I(inode); mutex_lock(&v9inode->v_mutex); - if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) && - !v9inode->writeback_fid && + if ((v9ses->cache) && !v9inode->writeback_fid && ((flags & O_ACCMODE) != O_RDONLY)) { /* * clone a fid and add it to writeback_fid @@ -908,7 +907,7 @@ int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) /** * v9fs_vfs_rename - VFS hook to rename an inode - * @mnt_userns: The user namespace of the mount + * @idmap: The idmap of the mount * @old_dir: old dir inode * @old_dentry: old dentry * @new_dir: new dir inode @@ -918,7 +917,7 @@ int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) */ int -v9fs_vfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +v9fs_vfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -1018,7 +1017,7 @@ error: /** * v9fs_vfs_getattr - retrieve file metadata - * @mnt_userns: The user namespace of the mount + * @idmap: idmap of the mount * @path: Object to query * @stat: metadata structure to populate * @request_mask: Mask of STATX_xxx flags indicating the caller's interests @@ -1027,7 +1026,7 @@ error: */ static int -v9fs_vfs_getattr(struct user_namespace *mnt_userns, const struct path *path, +v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; @@ -1038,7 +1037,7 @@ v9fs_vfs_getattr(struct user_namespace *mnt_userns, const struct path *path, p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { - generic_fillattr(&init_user_ns, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); return 0; } fid = v9fs_fid_lookup(dentry); @@ -1051,7 +1050,7 @@ v9fs_vfs_getattr(struct user_namespace *mnt_userns, const struct path *path, return PTR_ERR(st); v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0); - generic_fillattr(&init_user_ns, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(dentry), 
stat); p9stat_free(st); kfree(st); @@ -1060,13 +1059,13 @@ v9fs_vfs_getattr(struct user_namespace *mnt_userns, const struct path *path, /** * v9fs_vfs_setattr - set file metadata - * @mnt_userns: The user namespace of the mount + * @idmap: idmap of the mount * @dentry: file whose metadata to set * @iattr: metadata assignment structure * */ -static int v9fs_vfs_setattr(struct user_namespace *mnt_userns, +static int v9fs_vfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { int retval, use_dentry = 0; @@ -1077,7 +1076,7 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns, struct p9_wstat wstat; p9_debug(P9_DEBUG_VFS, "\n"); - retval = setattr_prepare(&init_user_ns, dentry, iattr); + retval = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (retval) return retval; @@ -1135,7 +1134,7 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns, v9fs_invalidate_inode_attr(inode); - setattr_copy(&init_user_ns, inode, iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); mark_inode_dirty(inode); return 0; } @@ -1300,7 +1299,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, /** * v9fs_vfs_symlink - helper function to create symlinks - * @mnt_userns: The user namespace of the mount + * @idmap: idmap of the mount * @dir: directory inode containing symlink * @dentry: dentry for symlink * @symname: symlink data @@ -1310,7 +1309,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, */ static int -v9fs_vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +v9fs_vfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { p9_debug(P9_DEBUG_VFS, " %lu,%pd,%s\n", @@ -1356,7 +1355,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, /** * v9fs_vfs_mknod - create a special file - * @mnt_userns: The user namespace of the mount + * @idmap: idmap of the mount * @dir: inode destination for new link * @dentry: dentry for file * @mode: mode for creation @@ -1365,7 +1364,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, */ static int -v9fs_vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +v9fs_vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index f806b3f11649..331ed60d8fcb 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -30,7 +30,7 @@ #include "acl.h" static int -v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir, +v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t omode, dev_t rdev); /** @@ -211,7 +211,7 @@ int v9fs_open_to_dotl_flags(int flags) /** * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. 
- * @mnt_userns: The user namespace of the mount + * @idmap: The idmap of the mount * @dir: directory inode that is being created * @dentry: dentry that is being deleted * @omode: create permissions * */ static int -v9fs_vfs_create_dotl(struct user_namespace *mnt_userns, struct inode *dir, +v9fs_vfs_create_dotl(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t omode, bool excl) { - return v9fs_vfs_mknod_dotl(mnt_userns, dir, dentry, omode, 0); + return v9fs_vfs_mknod_dotl(idmap, dir, dentry, omode, 0); } static int @@ -316,8 +316,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, v9inode = V9FS_I(inode); mutex_lock(&v9inode->v_mutex); - if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) && - !v9inode->writeback_fid && + if ((v9ses->cache) && !v9inode->writeback_fid && ((flags & O_ACCMODE) != O_RDONLY)) { /* * clone a fid and add it to writeback_fid @@ -340,9 +339,11 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, if (err) goto out; file->private_data = ofid; - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) +#ifdef CONFIG_9P_FSCACHE + if (v9ses->cache == CACHE_FSCACHE) fscache_use_cookie(v9fs_inode_cookie(v9inode), file->f_mode & FMODE_WRITE); +#endif v9fs_open_fid_add(inode, &ofid); file->f_mode |= FMODE_CREATED; out: @@ -356,14 +357,14 @@ out: /** * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory - * @mnt_userns: The user namespace of the mount + * @idmap: The idmap of the mount * @dir: inode that is being unlinked * @dentry: dentry that is being unlinked * @omode: mode for new directory * */ -static int v9fs_vfs_mkdir_dotl(struct user_namespace *mnt_userns, +static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t omode) { @@ -450,7 +451,7 @@ error: } static int -v9fs_vfs_getattr_dotl(struct user_namespace *mnt_userns, +v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { @@ -462,7 +463,7 @@ v9fs_vfs_getattr_dotl(struct user_namespace *mnt_userns, p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { - generic_fillattr(&init_user_ns, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); return 0; } fid = v9fs_fid_lookup(dentry); @@ -479,7 +480,7 @@ v9fs_vfs_getattr_dotl(struct user_namespace *mnt_userns, return PTR_ERR(st); v9fs_stat2inode_dotl(st, d_inode(dentry), 0); - generic_fillattr(&init_user_ns, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); /* Change block size to what the server returned */ stat->blksize = st->st_blksize; @@ -529,13 +530,13 @@ static int v9fs_mapped_iattr_valid(int iattr_valid) /** * v9fs_vfs_setattr_dotl - set file metadata - * @mnt_userns: The user namespace of the mount + * @idmap: idmap of the mount * @dentry: file whose metadata to set * @iattr: metadata assignment structure * */ -int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns, +int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { int retval, use_dentry = 0; @@ -548,7 +549,7 @@ int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns, p9_debug(P9_DEBUG_VFS, "\n"); - retval = setattr_prepare(&init_user_ns, dentry, iattr); + retval = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if
(retval) return retval; @@ -597,7 +598,7 @@ int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns, truncate_setsize(inode, iattr->ia_size); v9fs_invalidate_inode_attr(inode); - setattr_copy(&init_user_ns, inode, iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) { /* We also want to update ACL when we update mode bits */ @@ -687,7 +688,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, } static int -v9fs_vfs_symlink_dotl(struct user_namespace *mnt_userns, struct inode *dir, +v9fs_vfs_symlink_dotl(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { int err; @@ -817,7 +818,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, /** * v9fs_vfs_mknod_dotl - create a special file - * @mnt_userns: The user namespace of the mount + * @idmap: The idmap of the mount * @dir: inode destination for new link * @dentry: dentry for file * @omode: mode for creation @@ -825,7 +826,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, * */ static int -v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir, +v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t omode, dev_t rdev) { int err; diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index b6984311e00a..50f7f3f6b55e 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -150,7 +150,7 @@ static int v9fs_xattr_handler_get(const struct xattr_handler *handler, } static int v9fs_xattr_handler_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/Kconfig b/fs/Kconfig index 2685a4d0d353..e99830c65033 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -18,6 +18,10 @@ config VALIDATE_FS_PARSER config FS_IOMAP bool +# old blockdev_direct_IO implementation. Use iomap for new code instead +config LEGACY_DIRECT_IO + bool + if BLOCK source "fs/ext2/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index 4dea17840761..05f89b5c962f 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -16,16 +16,17 @@ obj-y := open.o read_write.o file_table.o super.o \ pnode.o splice.o sync.o utimes.o d_path.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ - kernel_read_file.o remap_range.o + kernel_read_file.o mnt_idmapping.o remap_range.o ifeq ($(CONFIG_BLOCK),y) -obj-y += buffer.o direct-io.o mpage.o +obj-y += buffer.o mpage.o else obj-y += no-block.o endif obj-$(CONFIG_PROC_FS) += proc_namespace.o +obj-$(CONFIG_LEGACY_DIRECT_IO) += direct-io.o obj-y += notify/ obj-$(CONFIG_EPOLL) += eventpoll.o obj-y += anon_inodes.o diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index 06b7c92343ad..223f0283d20f 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -144,7 +144,7 @@ struct adfs_discmap { /* Inode stuff */ struct inode *adfs_iget(struct super_block *sb, struct object_info *obj); int adfs_write_inode(struct inode *inode, struct writeback_control *wbc); -int adfs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, +int adfs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); /* map.c */ diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index ee22278b0cfc..c3ac613d0975 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -294,7 +294,7 @@ out: * later. 
*/ int -adfs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, +adfs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -302,7 +302,7 @@ adfs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, unsigned int ia_valid = attr->ia_valid; int error; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); /* * we can't change the UID or GID of any file - diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig index eb9d0ab850cb..962b86374e1c 100644 --- a/fs/affs/Kconfig +++ b/fs/affs/Kconfig @@ -2,6 +2,7 @@ config AFFS_FS tristate "Amiga FFS file system support" depends on BLOCK + select LEGACY_DIRECT_IO help The Fast File System (FFS) is the common file system used on hard disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20). Say Y diff --git a/fs/affs/affs.h b/fs/affs/affs.h index bfa89e131ead..60685ec76d98 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -167,17 +167,17 @@ extern const struct export_operations affs_export_ops; extern int affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len); extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int); extern int affs_unlink(struct inode *dir, struct dentry *dentry); -extern int affs_create(struct user_namespace *mnt_userns, struct inode *dir, +extern int affs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool); -extern int affs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +extern int affs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode); extern int affs_rmdir(struct inode *dir, struct dentry *dentry); extern int affs_link(struct dentry *olddentry, struct inode *dir, struct dentry *dentry); -extern int affs_symlink(struct user_namespace *mnt_userns, +extern int affs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname); -extern int affs_rename2(struct user_namespace *mnt_userns, +extern int affs_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); @@ -185,7 +185,7 @@ extern int affs_rename2(struct user_namespace *mnt_userns, /* inode.c */ extern struct inode *affs_new_inode(struct inode *dir); -extern int affs_notify_change(struct user_namespace *mnt_userns, +extern int affs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); extern void affs_evict_inode(struct inode *inode); extern struct inode *affs_iget(struct super_block *sb, diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 2352a75bd9d6..27f77a52c5c8 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -216,7 +216,7 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc) } int -affs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, +affs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -224,7 +224,7 @@ affs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, pr_debug("notify_change(%lu,0x%x)\n", inode->i_ino, attr->ia_valid); - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) goto out; @@ -250,7 +250,7 @@ affs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, 
affs_truncate(inode); } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); if (attr->ia_valid & ATTR_MODE) diff --git a/fs/affs/namei.c b/fs/affs/namei.c index bcab18956b4f..d12ccfd2a83d 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -242,7 +242,7 @@ affs_unlink(struct inode *dir, struct dentry *dentry) } int -affs_create(struct user_namespace *mnt_userns, struct inode *dir, +affs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct super_block *sb = dir->i_sb; @@ -274,7 +274,7 @@ affs_create(struct user_namespace *mnt_userns, struct inode *dir, } int -affs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +affs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; @@ -313,7 +313,7 @@ affs_rmdir(struct inode *dir, struct dentry *dentry) } int -affs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +affs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct super_block *sb = dir->i_sb; @@ -503,7 +503,7 @@ done: return retval; } -int affs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir, +int affs_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/afs/dir.c b/fs/afs/dir.c index b7c1f8c84b38..82690d1dd49a 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -28,17 +28,17 @@ static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, in loff_t fpos, u64 ino, unsigned dtype); static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype); -static int afs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int afs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl); -static int afs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode); static int afs_rmdir(struct inode *dir, struct dentry *dentry); static int afs_unlink(struct inode *dir, struct dentry *dentry); static int afs_link(struct dentry *from, struct inode *dir, struct dentry *dentry); -static int afs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *content); -static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags); @@ -1332,7 +1332,7 @@ static const struct afs_operation_ops afs_mkdir_operation = { /* * create a directory on an AFS filesystem */ -static int afs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct afs_operation *op; @@ -1630,7 +1630,7 @@ static const struct afs_operation_ops afs_create_operation = { /* * create a regular file on an AFS filesystem */ -static int afs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int afs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry 
*dentry, umode_t mode, bool excl) { struct afs_operation *op; @@ -1760,7 +1760,7 @@ static const struct afs_operation_ops afs_symlink_operation = { /* * create a symlink in an AFS filesystem */ -static int afs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *content) { struct afs_operation *op; @@ -1897,7 +1897,7 @@ static const struct afs_operation_ops afs_rename_operation = { /* * rename a file in an AFS filesystem and/or move it between directories */ -static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/afs/flock.c b/fs/afs/flock.c index bbcc5afd1576..9c6dea3139f5 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -451,7 +451,7 @@ static int afs_do_setlk_check(struct afs_vnode *vnode, struct key *key, */ static int afs_do_setlk(struct file *file, struct file_lock *fl) { - struct inode *inode = locks_inode(file); + struct inode *inode = file_inode(file); struct afs_vnode *vnode = AFS_FS_I(inode); enum afs_flock_mode mode = AFS_FS_S(inode->i_sb)->flock_mode; afs_lock_type_t type; @@ -701,7 +701,7 @@ error: */ static int afs_do_unlk(struct file *file, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); int ret; _enter("{%llx:%llu},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type); @@ -721,7 +721,7 @@ static int afs_do_unlk(struct file *file, struct file_lock *fl) */ static int afs_do_getlk(struct file *file, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); struct key *key = afs_file_key(file); int ret, lock_count; @@ -763,7 +763,7 @@ error: */ int afs_lock(struct file *file, int cmd, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); enum afs_flock_operation op; int ret; @@ -798,7 +798,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl) */ int afs_flock(struct file *file, int cmd, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); enum afs_flock_operation op; int ret; @@ -843,7 +843,7 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl) */ static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(fl->fl_file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->fl_file)); _enter(""); @@ -861,7 +861,7 @@ static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl) */ static void afs_fl_release_private(struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(fl->fl_file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->fl_file)); _enter(""); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 6d3a3dbe4928..0167e96e5198 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -737,7 +737,7 @@ error_unlock: /* * read the attributes of an inode */ -int afs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int afs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); @@ 
-761,7 +761,7 @@ int afs_getattr(struct user_namespace *mnt_userns, const struct path *path, do { read_seqbegin_or_lock(&vnode->cb_lock, &seq); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) && stat->nlink > 0) stat->nlink -= 1; @@ -870,7 +870,7 @@ static const struct afs_operation_ops afs_setattr_operation = { /* * set the attributes of an inode */ -int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int afs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { const unsigned int supported = diff --git a/fs/afs/internal.h b/fs/afs/internal.h index fd8567b98e2b..ad8523d0d038 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/ktime.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/pagemap.h> #include <linux/rxrpc.h> #include <linux/key.h> @@ -1170,9 +1171,9 @@ extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *); extern struct inode *afs_root_iget(struct super_block *, struct key *); extern bool afs_check_validity(struct afs_vnode *); extern int afs_validate(struct afs_vnode *, struct key *); -extern int afs_getattr(struct user_namespace *mnt_userns, const struct path *, +extern int afs_getattr(struct mnt_idmap *idmap, const struct path *, struct kstat *, u32, unsigned int); -extern int afs_setattr(struct user_namespace *mnt_userns, struct dentry *, struct iattr *); +extern int afs_setattr(struct mnt_idmap *idmap, struct dentry *, struct iattr *); extern void afs_evict_inode(struct inode *); extern int afs_drop_inode(struct inode *); @@ -1387,7 +1388,7 @@ extern void afs_cache_permit(struct afs_vnode *, struct key *, unsigned int, extern struct key *afs_request_key(struct afs_cell *); extern struct key *afs_request_key_rcu(struct afs_cell *); extern int afs_check_permit(struct afs_vnode *, struct key *, afs_access_t *); -extern int afs_permission(struct user_namespace *, struct inode *, int); +extern int afs_permission(struct mnt_idmap *, struct inode *, int); extern void __exit afs_clean_up_permit_cache(void); /* diff --git a/fs/afs/security.c b/fs/afs/security.c index 7c6a63a30394..6a7744c9e2a2 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -395,7 +395,7 @@ int afs_check_permit(struct afs_vnode *vnode, struct key *key, * - AFS ACLs are attached to directories only, and a file is controlled by its * parent directory's ACL */ -int afs_permission(struct user_namespace *mnt_userns, struct inode *inode, +int afs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct afs_vnode *vnode = AFS_FS_I(inode); diff --git a/fs/afs/write.c b/fs/afs/write.c index 19df10d63323..571f3b9a417e 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -704,85 +704,87 @@ static int afs_writepages_region(struct address_space *mapping, bool max_one_loop) { struct folio *folio; - struct page *head_page; + struct folio_batch fbatch; ssize_t ret; + unsigned int i; int n, skips = 0; _enter("%llx,%llx,", start, end); + folio_batch_init(&fbatch); do { pgoff_t index = start / PAGE_SIZE; - n = find_get_pages_range_tag(mapping, &index, end / PAGE_SIZE, - PAGECACHE_TAG_DIRTY, 1, &head_page); + n = filemap_get_folios_tag(mapping, &index, end / PAGE_SIZE, + PAGECACHE_TAG_DIRTY, &fbatch); + if (!n) break; + for (i = 0; i < n; i++) { + folio = fbatch.folios[i]; + start = folio_pos(folio); /* May regress with THPs */ - folio = 
page_folio(head_page); - start = folio_pos(folio); /* May regress with THPs */ - - _debug("wback %lx", folio_index(folio)); + _debug("wback %lx", folio_index(folio)); - /* At this point we hold neither the i_pages lock nor the - * page lock: the page may be truncated or invalidated - * (changing page->mapping to NULL), or even swizzled - * back from swapper_space to tmpfs file mapping - */ - if (wbc->sync_mode != WB_SYNC_NONE) { - ret = folio_lock_killable(folio); - if (ret < 0) { - folio_put(folio); - return ret; - } - } else { - if (!folio_trylock(folio)) { - folio_put(folio); - return 0; + /* At this point we hold neither the i_pages lock nor the + * page lock: the page may be truncated or invalidated + * (changing page->mapping to NULL), or even swizzled + * back from swapper_space to tmpfs file mapping + */ + if (wbc->sync_mode != WB_SYNC_NONE) { + ret = folio_lock_killable(folio); + if (ret < 0) { + folio_batch_release(&fbatch); + return ret; + } + } else { + if (!folio_trylock(folio)) + continue; } - } - if (folio_mapping(folio) != mapping || - !folio_test_dirty(folio)) { - start += folio_size(folio); - folio_unlock(folio); - folio_put(folio); - continue; - } + if (folio->mapping != mapping || + !folio_test_dirty(folio)) { + start += folio_size(folio); + folio_unlock(folio); + continue; + } - if (folio_test_writeback(folio) || - folio_test_fscache(folio)) { - folio_unlock(folio); - if (wbc->sync_mode != WB_SYNC_NONE) { - folio_wait_writeback(folio); + if (folio_test_writeback(folio) || + folio_test_fscache(folio)) { + folio_unlock(folio); + if (wbc->sync_mode != WB_SYNC_NONE) { + folio_wait_writeback(folio); #ifdef CONFIG_AFS_FSCACHE - folio_wait_fscache(folio); + folio_wait_fscache(folio); #endif - } else { - start += folio_size(folio); + } else { + start += folio_size(folio); + } + if (wbc->sync_mode == WB_SYNC_NONE) { + if (skips >= 5 || need_resched()) { + *_next = start; + _leave(" = 0 [%llx]", *_next); + return 0; + } + skips++; + } + continue; } - folio_put(folio); - if (wbc->sync_mode == WB_SYNC_NONE) { - if (skips >= 5 || need_resched()) - break; - skips++; + + if (!folio_clear_dirty_for_io(folio)) + BUG(); + ret = afs_write_back_from_locked_folio(mapping, wbc, + folio, start, end); + if (ret < 0) { + _leave(" = %zd", ret); + folio_batch_release(&fbatch); + return ret; } - continue; - } - if (!folio_clear_dirty_for_io(folio)) - BUG(); - ret = afs_write_back_from_locked_folio(mapping, wbc, folio, start, end); - folio_put(folio); - if (ret < 0) { - _leave(" = %zd", ret); - return ret; + start += ret; } - start += ret; - - if (max_one_loop) - break; - + folio_batch_release(&fbatch); cond_resched(); } while (wbc->nr_to_write > 0); @@ -992,7 +994,7 @@ int afs_launder_folio(struct folio *folio) { struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); struct iov_iter iter; - struct bio_vec bv[1]; + struct bio_vec bv; unsigned long priv; unsigned int f, t; int ret = 0; @@ -1008,10 +1010,8 @@ int afs_launder_folio(struct folio *folio) t = afs_folio_dirty_to(folio, priv); } - bv[0].bv_page = &folio->page; - bv[0].bv_offset = f; - bv[0].bv_len = t - f; - iov_iter_bvec(&iter, ITER_SOURCE, bv, 1, bv[0].bv_len); + bvec_set_folio(&bv, folio, t - f, f); + iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, bv.bv_len); trace_afs_folio_dirty(vnode, tracepoint_string("launder"), folio); ret = afs_store_data(vnode, &iter, folio_pos(folio) + f, true); diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c index 7751b0b3f81d..9048d8ccc715 100644 --- a/fs/afs/xattr.c +++ b/fs/afs/xattr.c @@ -97,7 +97,7 @@ static 
const struct afs_operation_ops afs_store_acl_operation = { * Set a file's AFS3 ACL. */ static int afs_xattr_set_acl(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) @@ -228,7 +228,7 @@ static const struct afs_operation_ops yfs_store_opaque_acl2_operation = { * Set a file's YFS ACL. */ static int afs_xattr_set_yfs(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) @@ -361,6 +361,9 @@ static int aio_ring_mremap(struct vm_area_struct *vma) spin_lock(&mm->ioctx_lock); rcu_read_lock(); table = rcu_dereference(mm->ioctx_table); + if (!table) + goto out_unlock; + for (i = 0; i < table->nr; i++) { struct kioctx *ctx; @@ -374,6 +377,7 @@ static int aio_ring_mremap(struct vm_area_struct *vma) } } +out_unlock: rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); return res; @@ -390,7 +394,7 @@ static const struct vm_operations_struct aio_ring_vm_ops = { static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) { - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTEXPAND); vma->vm_ops = &aio_ring_vm_ops; return 0; } diff --git a/fs/attr.c b/fs/attr.c index b45f30e516fa..aca9ff7aed33 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -14,6 +14,7 @@ #include <linux/capability.h> #include <linux/fsnotify.h> #include <linux/fcntl.h> +#include <linux/filelock.h> #include <linux/security.h> #include <linux/evm.h> #include <linux/ima.h> @@ -23,7 +24,7 @@ /** * setattr_should_drop_sgid - determine whether the setgid bit needs to be * removed - * @mnt_userns: user namespace of the mount @inode was found from + * @idmap: idmap of the mount @inode was found from * @inode: inode to check * * This function determines whether the setgid bit needs to be removed. @@ -33,7 +34,7 @@ * * Return: ATTR_KILL_SGID if setgid bit needs to be removed, 0 otherwise. */ -int setattr_should_drop_sgid(struct user_namespace *mnt_userns, +int setattr_should_drop_sgid(struct mnt_idmap *idmap, const struct inode *inode) { umode_t mode = inode->i_mode; @@ -42,8 +43,7 @@ int setattr_should_drop_sgid(struct user_namespace *mnt_userns, return 0; if (mode & S_IXGRP) return ATTR_KILL_SGID; - if (!in_group_or_capable(mnt_userns, inode, - i_gid_into_vfsgid(mnt_userns, inode))) + if (!in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode))) return ATTR_KILL_SGID; return 0; } @@ -51,7 +51,7 @@ int setattr_should_drop_sgid(struct user_namespace *mnt_userns, /** * setattr_should_drop_suidgid - determine whether the set{g,u}id bit needs to * be dropped - * @mnt_userns: user namespace of the mount @inode was found from + * @idmap: idmap of the mount @inode was found from * @inode: inode to check * * This function determines whether the set{g,u}id bits need to be removed. @@ -63,7 +63,7 @@ int setattr_should_drop_sgid(struct user_namespace *mnt_userns, * Return: A mask of ATTR_KILL_S{G,U}ID indicating which - if any - setid bits * to remove, 0 otherwise. 
*/ -int setattr_should_drop_suidgid(struct user_namespace *mnt_userns, +int setattr_should_drop_suidgid(struct mnt_idmap *idmap, struct inode *inode) { umode_t mode = inode->i_mode; @@ -73,7 +73,7 @@ int setattr_should_drop_suidgid(struct user_namespace *mnt_userns, if (unlikely(mode & S_ISUID)) kill = ATTR_KILL_SUID; - kill |= setattr_should_drop_sgid(mnt_userns, inode); + kill |= setattr_should_drop_sgid(idmap, inode); if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) return kill; @@ -84,24 +84,24 @@ EXPORT_SYMBOL(setattr_should_drop_suidgid); /** * chown_ok - verify permissions to chown inode - * @mnt_userns: user namespace of the mount @inode was found from + * @idmap: idmap of the mount @inode was found from * @inode: inode to check permissions on * @ia_vfsuid: uid to chown @inode to * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before checking + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply pass @nop_mnt_idmap. */ -static bool chown_ok(struct user_namespace *mnt_userns, +static bool chown_ok(struct mnt_idmap *idmap, const struct inode *inode, vfsuid_t ia_vfsuid) { - vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq_kuid(vfsuid, current_fsuid()) && vfsuid_eq(ia_vfsuid, vfsuid)) return true; - if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN)) + if (capable_wrt_inode_uidgid(idmap, inode, CAP_CHOWN)) return true; if (!vfsuid_valid(vfsuid) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) @@ -111,28 +111,28 @@ static bool chown_ok(struct user_namespace *mnt_userns, /** * chgrp_ok - verify permissions to chgrp inode - * @mnt_userns: user namespace of the mount @inode was found from + * @idmap: idmap of the mount @inode was found from * @inode: inode to check permissions on * @ia_vfsgid: gid to chown @inode to * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before checking + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply pass @nop_mnt_idmap. 
*/ -static bool chgrp_ok(struct user_namespace *mnt_userns, +static bool chgrp_ok(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t ia_vfsgid) { - vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); - vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); + vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) { if (vfsgid_eq(ia_vfsgid, vfsgid)) return true; if (vfsgid_in_group_p(ia_vfsgid)) return true; } - if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN)) + if (capable_wrt_inode_uidgid(idmap, inode, CAP_CHOWN)) return true; if (!vfsgid_valid(vfsgid) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) @@ -142,7 +142,7 @@ static bool chgrp_ok(struct user_namespace *mnt_userns, /** * setattr_prepare - check if attribute changes to a dentry are allowed - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: dentry to check * @attr: attributes to change * @@ -152,16 +152,16 @@ static bool chgrp_ok(struct user_namespace *mnt_userns, * SGID bit from mode if user is not allowed to set it. Also file capabilities * and IMA extended attributes are cleared if ATTR_KILL_PRIV is set. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before checking + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply pass @nop_mnt_idmap. * * Should be called as the first thing in ->setattr implementations, * possibly after taking additional locks. */ -int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry, +int setattr_prepare(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -183,34 +183,34 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry, /* Make sure a caller can chown. */ if ((ia_valid & ATTR_UID) && - !chown_ok(mnt_userns, inode, attr->ia_vfsuid) + !chown_ok(idmap, inode, attr->ia_vfsuid)) return -EPERM; /* Make sure caller can chgrp. */ if ((ia_valid & ATTR_GID) && - !chgrp_ok(mnt_userns, inode, attr->ia_vfsgid) + !chgrp_ok(idmap, inode, attr->ia_vfsgid)) return -EPERM; /* Make sure a caller can chmod. */ if (ia_valid & ATTR_MODE) { vfsgid_t vfsgid; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EPERM; if (ia_valid & ATTR_GID) vfsgid = attr->ia_vfsgid; else - vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsgid = i_gid_into_vfsgid(idmap, inode); /* Also check the setgid bit! */ - if (!in_group_or_capable(mnt_userns, inode, vfsgid)) + if (!in_group_or_capable(idmap, inode, vfsgid)) attr->ia_mode &= ~S_ISGID; } /* Check for setting the inode time. 
*/ if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) { - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EPERM; } @@ -219,7 +219,7 @@ kill_priv: if (ia_valid & ATTR_KILL_PRIV) { int error; - error = security_inode_killpriv(mnt_userns, dentry); + error = security_inode_killpriv(idmap, dentry); if (error) return error; } @@ -276,7 +276,7 @@ EXPORT_SYMBOL(inode_newsize_ok); /** * setattr_copy - copy simple metadata updates into the generic inode - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: the inode to be updated * @attr: the new attributes * @@ -289,23 +289,23 @@ EXPORT_SYMBOL(inode_newsize_ok); * Noticeably missing is inode size update, which is more complex * as it requires pagecache updates. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before checking + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply pass @nop_mnt_idmap. * * The inode is not marked as dirty after this operation. The rationale is * that for "simple" filesystems, the struct inode is the inode storage. * The caller is free to mark the inode dirty afterwards if needed. */ -void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode, +void setattr_copy(struct mnt_idmap *idmap, struct inode *inode, const struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; - i_uid_update(mnt_userns, attr, inode); - i_gid_update(mnt_userns, attr, inode); + i_uid_update(idmap, attr, inode); + i_gid_update(idmap, attr, inode); if (ia_valid & ATTR_ATIME) inode->i_atime = attr->ia_atime; if (ia_valid & ATTR_MTIME) @@ -314,15 +314,15 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode, inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - if (!in_group_or_capable(mnt_userns, inode, - i_gid_into_vfsgid(mnt_userns, inode))) + if (!in_group_or_capable(idmap, inode, + i_gid_into_vfsgid(idmap, inode))) mode &= ~S_ISGID; inode->i_mode = mode; } } EXPORT_SYMBOL(setattr_copy); -int may_setattr(struct user_namespace *mnt_userns, struct inode *inode, +int may_setattr(struct mnt_idmap *idmap, struct inode *inode, unsigned int ia_valid) { int error; @@ -340,8 +340,8 @@ int may_setattr(struct user_namespace *mnt_userns, struct inode *inode, if (IS_IMMUTABLE(inode)) return -EPERM; - if (!inode_owner_or_capable(mnt_userns, inode)) { - error = inode_permission(mnt_userns, inode, MAY_WRITE); + if (!inode_owner_or_capable(idmap, inode)) { + error = inode_permission(idmap, inode, MAY_WRITE); if (error) return error; } @@ -352,7 +352,7 @@ EXPORT_SYMBOL(may_setattr); /** * notify_change - modify attributes of a filesytem object - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: object affected * @attr: new attributes * @delegated_inode: returns inode, if the inode is delegated @@ -371,13 +371,13 @@ EXPORT_SYMBOL(may_setattr); * the file open for write, as 
there can be no conflicting delegation in * that case. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before checking + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply pass @nop_mnt_idmap. */ -int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, +int notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; @@ -388,7 +388,7 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, WARN_ON_ONCE(!inode_is_locked(inode)); - error = may_setattr(mnt_userns, inode, ia_valid); + error = may_setattr(idmap, inode, ia_valid); if (error) return error; @@ -453,11 +453,11 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, * namespace of the superblock. */ if (ia_valid & ATTR_UID && - !vfsuid_has_fsmapping(mnt_userns, inode->i_sb->s_user_ns, + !vfsuid_has_fsmapping(idmap, inode->i_sb->s_user_ns, attr->ia_vfsuid)) return -EOVERFLOW; if (ia_valid & ATTR_GID && - !vfsgid_has_fsmapping(mnt_userns, inode->i_sb->s_user_ns, + !vfsgid_has_fsmapping(idmap, inode->i_sb->s_user_ns, attr->ia_vfsgid)) return -EOVERFLOW; @@ -465,13 +465,13 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, * gids unless those uids & gids are being made valid. 
*/ if (!(ia_valid & ATTR_UID) && - !vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode))) + !vfsuid_valid(i_uid_into_vfsuid(idmap, inode))) return -EOVERFLOW; if (!(ia_valid & ATTR_GID) && - !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode))) + !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; - error = security_inode_setattr(mnt_userns, dentry, attr); + error = security_inode_setattr(idmap, dentry, attr); if (error) return error; error = try_break_deleg(inode, delegated_inode); @@ -479,13 +479,13 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, return error; if (inode->i_op->setattr) - error = inode->i_op->setattr(mnt_userns, dentry, attr); + error = inode->i_op->setattr(idmap, dentry, attr); else - error = simple_setattr(mnt_userns, dentry, attr); + error = simple_setattr(idmap, dentry, attr); if (!error) { fsnotify_change(dentry, ia_valid); - ima_inode_post_setattr(mnt_userns, dentry); + ima_inode_post_setattr(idmap, dentry); evm_inode_post_setattr(dentry, ia_valid); } diff --git a/fs/autofs/root.c b/fs/autofs/root.c index ca03c1cae2be..6baf90b08e0e 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -10,12 +10,12 @@ #include "autofs_i.h" -static int autofs_dir_permission(struct user_namespace *, struct inode *, int); -static int autofs_dir_symlink(struct user_namespace *, struct inode *, +static int autofs_dir_permission(struct mnt_idmap *, struct inode *, int); +static int autofs_dir_symlink(struct mnt_idmap *, struct inode *, struct dentry *, const char *); static int autofs_dir_unlink(struct inode *, struct dentry *); static int autofs_dir_rmdir(struct inode *, struct dentry *); -static int autofs_dir_mkdir(struct user_namespace *, struct inode *, +static int autofs_dir_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, umode_t); static long autofs_root_ioctl(struct file *, unsigned int, unsigned long); #ifdef CONFIG_COMPAT @@ -543,7 +543,7 @@ static struct dentry *autofs_lookup(struct inode *dir, return NULL; } -static int autofs_dir_permission(struct user_namespace *mnt_userns, +static int autofs_dir_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { if (mask & MAY_WRITE) { @@ -560,10 +560,10 @@ static int autofs_dir_permission(struct user_namespace *mnt_userns, return -EACCES; } - return generic_permission(mnt_userns, inode, mask); + return generic_permission(idmap, inode, mask); } -static int autofs_dir_symlink(struct user_namespace *mnt_userns, +static int autofs_dir_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { @@ -720,7 +720,7 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) return 0; } -static int autofs_dir_mkdir(struct user_namespace *mnt_userns, +static int autofs_dir_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 92737166203f..db649487d58c 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -27,7 +27,7 @@ static const struct file_operations bad_file_ops = .open = bad_file_open, }; -static int bad_inode_create(struct user_namespace *mnt_userns, +static int bad_inode_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { @@ -51,14 +51,14 @@ static int bad_inode_unlink(struct inode *dir, struct dentry *dentry) return -EIO; } -static int bad_inode_symlink(struct user_namespace *mnt_userns, +static int bad_inode_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, 
const char *symname) { return -EIO; } -static int bad_inode_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int bad_inode_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { return -EIO; @@ -69,13 +69,13 @@ static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry) return -EIO; } -static int bad_inode_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int bad_inode_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { return -EIO; } -static int bad_inode_rename2(struct user_namespace *mnt_userns, +static int bad_inode_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) @@ -89,20 +89,20 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, return -EIO; } -static int bad_inode_permission(struct user_namespace *mnt_userns, +static int bad_inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { return -EIO; } -static int bad_inode_getattr(struct user_namespace *mnt_userns, +static int bad_inode_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { return -EIO; } -static int bad_inode_setattr(struct user_namespace *mnt_userns, +static int bad_inode_setattr(struct mnt_idmap *idmap, struct dentry *direntry, struct iattr *attrs) { return -EIO; @@ -146,14 +146,14 @@ static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry, return -EIO; } -static int bad_inode_tmpfile(struct user_namespace *mnt_userns, +static int bad_inode_tmpfile(struct mnt_idmap *idmap, struct inode *inode, struct file *file, umode_t mode) { return -EIO; } -static int bad_inode_set_acl(struct user_namespace *mnt_userns, +static int bad_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 34d4f68f786b..040d5140e426 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -75,7 +75,7 @@ const struct file_operations bfs_dir_operations = { .llseek = generic_file_llseek, }; -static int bfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int bfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { int err; @@ -96,7 +96,7 @@ static int bfs_create(struct user_namespace *mnt_userns, struct inode *dir, } set_bit(ino, info->si_imap); info->si_freei--; - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_blocks = 0; inode->i_op = &bfs_file_inops; @@ -199,7 +199,7 @@ out_brelse: return error; } -static int bfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int bfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 9a780fafc539..8a884e795f6a 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -46,6 +46,7 @@ #include <linux/cred.h> #include <linux/dax.h> #include <linux/uaccess.h> +#include <linux/rseq.h> #include <asm/param.h> #include <asm/page.h> @@ -288,6 +289,10 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, if (bprm->have_execfd) { NEW_AUX_ENT(AT_EXECFD, bprm->execfd); } +#ifdef CONFIG_RSEQ + 
NEW_AUX_ENT(AT_RSEQ_FEATURE_SIZE, offsetof(struct rseq, end)); + NEW_AUX_ENT(AT_RSEQ_ALIGN, __alignof__(struct rseq)); +#endif #undef NEW_AUX_ENT /* AT_NULL is zero; clear the rest too */ memset(elf_info, 0, (char *)mm->saved_auxv + diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 183e5c4aed34..37b6bab90c83 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -17,7 +17,6 @@ config BTRFS_FS select FS_IOMAP select RAID6_PQ select XOR_BLOCKS - select SRCU depends on PAGE_SIZE_LESS_THAN_256KB help diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 555c962fdad6..90d53209755b 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -11,7 +11,8 @@ condflags := \ $(call cc-option, -Wunused-but-set-variable) \ $(call cc-option, -Wunused-const-variable) \ $(call cc-option, -Wpacked-not-aligned) \ - $(call cc-option, -Wstringop-truncation) + $(call cc-option, -Wstringop-truncation) \ + $(call cc-option, -Wmaybe-uninitialized) subdir-ccflags-y += $(condflags) # The following turn off the warnings enabled by -Wextra subdir-ccflags-y += -Wno-missing-field-initializers @@ -31,7 +32,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ - subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o + subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \ + lru_cache.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 3da1779e8b79..7427449a04a3 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -110,7 +110,7 @@ out: return ret; } -int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int btrfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int ret; @@ -118,7 +118,7 @@ int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, umode_t old_mode = inode->i_mode; if (type == ACL_TYPE_ACCESS && acl) { - ret = posix_acl_update_mode(mnt_userns, inode, + ret = posix_acl_update_mode(idmap, inode, &inode->i_mode, &acl); if (ret) return ret; diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h index 39bd36e6eeb7..a270e71ec05f 100644 --- a/fs/btrfs/acl.h +++ b/fs/btrfs/acl.h @@ -6,7 +6,7 @@ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); -int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int btrfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct posix_acl *acl, int type); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 46851511b661..90e40d5ceccd 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1252,8 +1252,12 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ct struct btrfs_root *root, u64 bytenr, int level, bool *is_shared) { + const struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_backref_shared_cache_entry *entry; + if (!current->journal_info) + lockdep_assert_held(&fs_info->commit_root_sem); + if (!ctx->use_path_cache) return false; @@ -1288,7 +1292,7 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ct * could be a snapshot sharing this extent buffer. 
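Aside: the two AT_RSEQ_* entries added to create_elf_tables() in the binfmt_elf.c hunk above become visible to userspace through the auxiliary vector. A minimal sketch (not part of this patch) of how a program might probe them, assuming glibc's getauxval() and the uapi values 27/28 that accompany this series; the fallback defines are only for older headers:

#include <stdio.h>
#include <sys/auxv.h>

#ifndef AT_RSEQ_FEATURE_SIZE
#define AT_RSEQ_FEATURE_SIZE	27	/* rseq supported feature size */
#endif
#ifndef AT_RSEQ_ALIGN
#define AT_RSEQ_ALIGN		28	/* rseq allocation alignment */
#endif

int main(void)
{
	unsigned long feature_size = getauxval(AT_RSEQ_FEATURE_SIZE);
	unsigned long align = getauxval(AT_RSEQ_ALIGN);

	if (!feature_size)
		puts("kernel does not export AT_RSEQ_* entries");
	else
		printf("rseq feature size %lu, alignment %lu\n",
		       feature_size, align);
	return 0;
}

On a kernel with this change, feature_size reports offsetof(struct rseq, end) and align reports __alignof__(struct rseq), matching the two NEW_AUX_ENT() lines.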
*/ if (entry->is_shared && - entry->gen != btrfs_get_last_root_drop_gen(root->fs_info)) + entry->gen != btrfs_get_last_root_drop_gen(fs_info)) return false; *is_shared = entry->is_shared; @@ -1318,9 +1322,13 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx struct btrfs_root *root, u64 bytenr, int level, bool is_shared) { + const struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_backref_shared_cache_entry *entry; u64 gen; + if (!current->journal_info) + lockdep_assert_held(&fs_info->commit_root_sem); + if (!ctx->use_path_cache) return; @@ -1336,7 +1344,7 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx ASSERT(level >= 0); if (is_shared) - gen = btrfs_get_last_root_drop_gen(root->fs_info); + gen = btrfs_get_last_root_drop_gen(fs_info); else gen = btrfs_root_last_snapshot(&root->root_item); @@ -1864,6 +1872,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, .have_delayed_delete_refs = false, }; int level; + bool leaf_cached; + bool leaf_is_shared; for (int i = 0; i < BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE; i++) { if (ctx->prev_extents_cache[i].bytenr == bytenr) @@ -1885,6 +1895,23 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, walk_ctx.time_seq = elem.seq; } + ctx->use_path_cache = true; + + /* + * We may have previously determined that the current leaf is shared. + * If it is, then we have a data extent that is shared due to a shared + * subtree (caused by snapshotting) and we don't need to check for data + * backrefs. If the leaf is not shared, then we must do backref walking + * to determine if the data extent is shared through reflinks. + */ + leaf_cached = lookup_backref_shared_cache(ctx, root, + ctx->curr_leaf_bytenr, 0, + &leaf_is_shared); + if (leaf_cached && leaf_is_shared) { + ret = 1; + goto out_trans; + } + walk_ctx.ignore_extent_item_pos = true; walk_ctx.trans = trans; walk_ctx.fs_info = fs_info; @@ -1893,7 +1920,6 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, /* -1 means we are in the bytenr of the data extent. */ level = -1; ULIST_ITER_INIT(&uiter); - ctx->use_path_cache = true; while (1) { bool is_shared; bool cached; @@ -1964,6 +1990,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, ctx->prev_extents_cache_slot = slot; } +out_trans: if (trans) { btrfs_put_tree_mod_seq(fs_info, &elem); btrfs_end_transaction(trans); diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 8affc88b0e0a..d8b90f95b157 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -14,19 +14,31 @@ #include "dev-replace.h" #include "rcu-string.h" #include "zoned.h" +#include "file-item.h" static struct bio_set btrfs_bioset; +static struct bio_set btrfs_clone_bioset; +static struct bio_set btrfs_repair_bioset; +static mempool_t btrfs_failed_bio_pool; + +struct btrfs_failed_bio { + struct btrfs_bio *bbio; + int num_copies; + atomic_t repair_count; +}; /* * Initialize a btrfs_bio structure. This skips the embedded bio itself as it * is already initialized by the block layer. 
*/ -static inline void btrfs_bio_init(struct btrfs_bio *bbio, - btrfs_bio_end_io_t end_io, void *private) +void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, + btrfs_bio_end_io_t end_io, void *private) { memset(bbio, 0, offsetof(struct btrfs_bio, bio)); + bbio->inode = inode; bbio->end_io = end_io; bbio->private = private; + atomic_set(&bbio->pending_ios, 1); } /* @@ -37,32 +49,235 @@ static inline void btrfs_bio_init(struct btrfs_bio *bbio, * a mempool. */ struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + struct btrfs_inode *inode, btrfs_bio_end_io_t end_io, void *private) { struct bio *bio; bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); - btrfs_bio_init(btrfs_bio(bio), end_io, private); + btrfs_bio_init(btrfs_bio(bio), inode, end_io, private); return bio; } -struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, - btrfs_bio_end_io_t end_io, void *private) +static struct bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, + struct bio *orig, u64 map_length, + bool use_append) { + struct btrfs_bio *orig_bbio = btrfs_bio(orig); struct bio *bio; - struct btrfs_bio *bbio; - ASSERT(offset <= UINT_MAX && size <= UINT_MAX); + if (use_append) { + unsigned int nr_segs; + + bio = bio_split_rw(orig, &fs_info->limits, &nr_segs, + &btrfs_clone_bioset, map_length); + } else { + bio = bio_split(orig, map_length >> SECTOR_SHIFT, GFP_NOFS, + &btrfs_clone_bioset); + } + btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode, NULL, orig_bbio); - bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); - bbio = btrfs_bio(bio); - btrfs_bio_init(bbio, end_io, private); + btrfs_bio(bio)->file_offset = orig_bbio->file_offset; + if (!(orig->bi_opf & REQ_BTRFS_ONE_ORDERED)) + orig_bbio->file_offset += map_length; - bio_trim(bio, offset >> 9, size >> 9); - bbio->iter = bio->bi_iter; + atomic_inc(&orig_bbio->pending_ios); return bio; } +static void btrfs_orig_write_end_io(struct bio *bio); + +static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, + struct btrfs_bio *orig_bbio) +{ + /* + * For writes we tolerate nr_mirrors - 1 write failures, so we can't + * just blindly propagate a write failure here. Instead increment the + * error count in the original I/O context so that it is guaranteed to + * be larger than the error tolerance. 
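The pending_ios counter initialized to 1 in btrfs_bio_init() and bumped for every split in btrfs_split_bio() is a plain split-and-complete refcount: the original bio holds one reference for itself, each cloned chunk holds another, and whichever completion drops the count to zero runs the end_io handler exactly once. A standalone toy model of that accounting (illustrative names, not btrfs API):

#include <stdatomic.h>
#include <stdio.h>

struct toy_bio {
	atomic_int pending_ios;
};

static void toy_bio_init(struct toy_bio *b)
{
	atomic_init(&b->pending_ios, 1);	/* the original bio's own ref */
}

static void toy_bio_split(struct toy_bio *b)
{
	atomic_fetch_add(&b->pending_ios, 1);	/* one ref per split chunk */
}

static void toy_bio_complete(struct toy_bio *b)
{
	/* The last completion (count hits zero) fires end_io exactly once. */
	if (atomic_fetch_sub(&b->pending_ios, 1) == 1)
		puts("all chunks done: run end_io");
}

int main(void)
{
	struct toy_bio b;

	toy_bio_init(&b);
	toy_bio_split(&b);	/* one chunk split off */
	toy_bio_complete(&b);	/* split chunk finishes */
	toy_bio_complete(&b);	/* original remainder finishes: end_io runs */
	return 0;
}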
+ */ + if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) { + struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private; + struct btrfs_io_context *orig_bioc = orig_stripe->bioc; + + atomic_add(orig_bioc->max_errors, &orig_bioc->error); + } else { + orig_bbio->bio.bi_status = bbio->bio.bi_status; + } +} + +static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio) +{ + if (bbio->bio.bi_pool == &btrfs_clone_bioset) { + struct btrfs_bio *orig_bbio = bbio->private; + + if (bbio->bio.bi_status) + btrfs_bbio_propagate_error(bbio, orig_bbio); + bio_put(&bbio->bio); + bbio = orig_bbio; + } + + if (atomic_dec_and_test(&bbio->pending_ios)) + bbio->end_io(bbio); +} + +static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) +{ + if (cur_mirror == fbio->num_copies) + return cur_mirror + 1 - fbio->num_copies; + return cur_mirror + 1; +} + +static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) +{ + if (cur_mirror == 1) + return fbio->num_copies; + return cur_mirror - 1; +} + +static void btrfs_repair_done(struct btrfs_failed_bio *fbio) +{ + if (atomic_dec_and_test(&fbio->repair_count)) { + btrfs_orig_bbio_end_io(fbio->bbio); + mempool_free(fbio, &btrfs_failed_bio_pool); + } +} + +static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, + struct btrfs_device *dev) +{ + struct btrfs_failed_bio *fbio = repair_bbio->private; + struct btrfs_inode *inode = repair_bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio); + int mirror = repair_bbio->mirror_num; + + if (repair_bbio->bio.bi_status || + !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) { + bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ); + repair_bbio->bio.bi_iter = repair_bbio->saved_iter; + + mirror = next_repair_mirror(fbio, mirror); + if (mirror == fbio->bbio->mirror_num) { + btrfs_debug(fs_info, "no mirror left"); + fbio->bbio->bio.bi_status = BLK_STS_IOERR; + goto done; + } + + btrfs_submit_bio(&repair_bbio->bio, mirror); + return; + } + + do { + mirror = prev_repair_mirror(fbio, mirror); + btrfs_repair_io_failure(fs_info, btrfs_ino(inode), + repair_bbio->file_offset, fs_info->sectorsize, + repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT, + bv->bv_page, bv->bv_offset, mirror); + } while (mirror != fbio->bbio->mirror_num); + +done: + btrfs_repair_done(fbio); + bio_put(&repair_bbio->bio); +} + +/* + * Try to kick off a repair read to the next available mirror for a bad sector. + * + * This primarily tries to recover good data to serve the actual read request, + * but also tries to write the good data back to the bad mirror(s) when a + * read succeeded to restore the redundancy. 
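next_repair_mirror() and prev_repair_mirror() above implement a simple 1-based ring over the available copies: walking "next" from the failed mirror visits every other copy exactly once before wrapping back, and "prev" retraces the same ring for the write-back pass. A self-contained demo of the wrap-around arithmetic (a sketch, not kernel code):

#include <stdio.h>

static int next_mirror(int cur, int num_copies)
{
	/* Mirrors are numbered 1..num_copies, so wrap back around to 1. */
	return (cur == num_copies) ? 1 : cur + 1;
}

int main(void)
{
	const int num_copies = 3;
	const int failed = 2;

	printf("read failed on mirror %d, repair order:", failed);
	for (int m = next_mirror(failed, num_copies); m != failed;
	     m = next_mirror(m, num_copies))
		printf(" %d", m);
	putchar('\n');	/* prints: 3 1 */
	return 0;
}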
+ */ +static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, + u32 bio_offset, + struct bio_vec *bv, + struct btrfs_failed_bio *fbio) +{ + struct btrfs_inode *inode = failed_bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u32 sectorsize = fs_info->sectorsize; + const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT); + struct btrfs_bio *repair_bbio; + struct bio *repair_bio; + int num_copies; + int mirror; + + btrfs_debug(fs_info, "repair read error: read error at %llu", + failed_bbio->file_offset + bio_offset); + + num_copies = btrfs_num_copies(fs_info, logical, sectorsize); + if (num_copies == 1) { + btrfs_debug(fs_info, "no copy to repair from"); + failed_bbio->bio.bi_status = BLK_STS_IOERR; + return fbio; + } + + if (!fbio) { + fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS); + fbio->bbio = failed_bbio; + fbio->num_copies = num_copies; + atomic_set(&fbio->repair_count, 1); + } + + atomic_inc(&fbio->repair_count); + + repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, + &btrfs_repair_bioset); + repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector; + bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset); + + repair_bbio = btrfs_bio(repair_bio); + btrfs_bio_init(repair_bbio, failed_bbio->inode, NULL, fbio); + repair_bbio->file_offset = failed_bbio->file_offset + bio_offset; + + mirror = next_repair_mirror(fbio, failed_bbio->mirror_num); + btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror); + btrfs_submit_bio(repair_bio, mirror); + return fbio; +} + +static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev) +{ + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u32 sectorsize = fs_info->sectorsize; + struct bvec_iter *iter = &bbio->saved_iter; + blk_status_t status = bbio->bio.bi_status; + struct btrfs_failed_bio *fbio = NULL; + u32 offset = 0; + + /* + * Hand off repair bios to the repair code as there is no upper level + * submitter for them. + */ + if (bbio->bio.bi_pool == &btrfs_repair_bioset) { + btrfs_end_repair_bio(bbio, dev); + return; + } + + /* Clear the I/O error. A failed repair will reset it. */ + bbio->bio.bi_status = BLK_STS_OK; + + while (iter->bi_size) { + struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter); + + bv.bv_len = min(bv.bv_len, sectorsize); + if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv)) + fbio = repair_one_sector(bbio, offset, &bv, fbio); + + bio_advance_iter_single(&bbio->bio, iter, sectorsize); + offset += sectorsize; + } + + if (bbio->csum != bbio->csum_inline) + kfree(bbio->csum); + + if (fbio) + btrfs_repair_done(fbio); + else + btrfs_orig_bbio_end_io(bbio); +} + static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) { if (!dev || !dev->bdev) @@ -90,24 +305,31 @@ static void btrfs_end_bio_work(struct work_struct *work) { struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); - bbio->end_io(bbio); + /* Metadata reads are checked and repaired by the submitter. 
*/ + if (bbio->bio.bi_opf & REQ_META) + bbio->end_io(bbio); + else + btrfs_check_read_bio(bbio, bbio->bio.bi_private); } static void btrfs_simple_end_io(struct bio *bio) { - struct btrfs_fs_info *fs_info = bio->bi_private; struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_device *dev = bio->bi_private; + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; btrfs_bio_counter_dec(fs_info); if (bio->bi_status) - btrfs_log_dev_io_error(bio, bbio->device); + btrfs_log_dev_io_error(bio, dev); if (bio_op(bio) == REQ_OP_READ) { INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); } else { - bbio->end_io(bbio); + if (bio_op(bio) == REQ_OP_ZONE_APPEND) + btrfs_record_physical_zoned(bbio); + btrfs_orig_bbio_end_io(bbio); } } @@ -118,7 +340,10 @@ static void btrfs_raid56_end_io(struct bio *bio) btrfs_bio_counter_dec(bioc->fs_info); bbio->mirror_num = bioc->mirror_num; - bbio->end_io(bbio); + if (bio_op(bio) == REQ_OP_READ && !(bbio->bio.bi_opf & REQ_META)) + btrfs_check_read_bio(bbio, NULL); + else + btrfs_orig_bbio_end_io(bbio); btrfs_put_bioc(bioc); } @@ -145,7 +370,7 @@ static void btrfs_orig_write_end_io(struct bio *bio) else bio->bi_status = BLK_STS_OK; - bbio->end_io(bbio); + btrfs_orig_bbio_end_io(bbio); btrfs_put_bioc(bioc); } @@ -181,16 +406,10 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) */ if (bio_op(bio) == REQ_OP_ZONE_APPEND) { u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + u64 zone_start = round_down(physical, dev->fs_info->zone_size); - if (btrfs_dev_is_sequential(dev, physical)) { - u64 zone_start = round_down(physical, - dev->fs_info->zone_size); - - bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; - } else { - bio->bi_opf &= ~REQ_OP_ZONE_APPEND; - bio->bi_opf |= REQ_OP_WRITE; - } + ASSERT(btrfs_dev_is_sequential(dev, physical)); + bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; } btrfs_debug_in_rcu(dev->fs_info, "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", @@ -224,41 +443,21 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); } -void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) +static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, + struct btrfs_io_stripe *smap, int mirror_num) { - u64 logical = bio->bi_iter.bi_sector << 9; - u64 length = bio->bi_iter.bi_size; - u64 map_length = length; - struct btrfs_io_context *bioc = NULL; - struct btrfs_io_stripe smap; - int ret; - - btrfs_bio_counter_inc_blocked(fs_info); - ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, - &bioc, &smap, &mirror_num, 1); - if (ret) { - btrfs_bio_counter_dec(fs_info); - btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); - return; - } - - if (map_length < length) { - btrfs_crit(fs_info, - "mapping failed logical %llu bio len %llu len %llu", - logical, length, map_length); - BUG(); - } + /* Do not leak our private flag into the block layer. */ + bio->bi_opf &= ~REQ_BTRFS_ONE_ORDERED; if (!bioc) { - /* Single mirror read/write fast path */ + /* Single mirror read/write fast path. 
*/ btrfs_bio(bio)->mirror_num = mirror_num; - btrfs_bio(bio)->device = smap.dev; - bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; - bio->bi_private = fs_info; + bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT; + bio->bi_private = smap->dev; bio->bi_end_io = btrfs_simple_end_io; - btrfs_submit_dev_bio(smap.dev, bio); + btrfs_submit_dev_bio(smap->dev, bio); } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - /* Parity RAID write or read recovery */ + /* Parity RAID write or read recovery. */ bio->bi_private = bioc; bio->bi_end_io = btrfs_raid56_end_io; if (bio_op(bio) == REQ_OP_READ) @@ -266,16 +465,233 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror else raid56_parity_write(bio, bioc); } else { - /* Write to multiple mirrors */ + /* Write to multiple mirrors. */ int total_devs = bioc->num_stripes; - int dev_nr; bioc->orig_bio = bio; - for (dev_nr = 0; dev_nr < total_devs; dev_nr++) + for (int dev_nr = 0; dev_nr < total_devs; dev_nr++) btrfs_submit_mirrored_bio(bioc, dev_nr); } } +static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio) +{ + if (bbio->bio.bi_opf & REQ_META) + return btree_csum_one_bio(bbio); + return btrfs_csum_one_bio(bbio); +} + +/* + * Async submit bios are used to offload expensive checksumming onto the worker + * threads. + */ +struct async_submit_bio { + struct btrfs_bio *bbio; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe smap; + int mirror_num; + struct btrfs_work work; +}; + +/* + * In order to insert checksums into the metadata in large chunks, we wait + * until bio submission time. All the pages in the bio are checksummed and + * sums are attached onto the ordered extent record. + * + * At IO completion time the csums attached on the ordered extent record are + * inserted into the btree. + */ +static void run_one_async_start(struct btrfs_work *work) +{ + struct async_submit_bio *async = + container_of(work, struct async_submit_bio, work); + blk_status_t ret; + + ret = btrfs_bio_csum(async->bbio); + if (ret) + async->bbio->bio.bi_status = ret; +} + +/* + * In order to insert checksums into the metadata in large chunks, we wait + * until bio submission time. All the pages in the bio are checksummed and + * sums are attached onto the ordered extent record. + * + * At IO completion time the csums attached on the ordered extent record are + * inserted into the tree. + */ +static void run_one_async_done(struct btrfs_work *work) +{ + struct async_submit_bio *async = + container_of(work, struct async_submit_bio, work); + struct bio *bio = &async->bbio->bio; + + /* If an error occurred we just want to clean up the bio and move on. */ + if (bio->bi_status) { + btrfs_orig_bbio_end_io(async->bbio); + return; + } + + /* + * All of the bios that pass through here are from async helpers. + * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context. + * This changes nothing when cgroups aren't in use. + */ + bio->bi_opf |= REQ_CGROUP_PUNT; + __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); +} + +static void run_one_async_free(struct btrfs_work *work) +{ + kfree(container_of(work, struct async_submit_bio, work)); +} + +static bool should_async_write(struct btrfs_bio *bbio) +{ + /* + * If the I/O is not issued by fsync and friends, (->sync_writers != 0), + * then try to defer the submission to a workqueue to parallelize the + * checksum calculation. 
+ */ + if (atomic_read(&bbio->inode->sync_writers)) + return false; + + /* + * Submit metadata writes synchronously if the checksum implementation + * is fast, or we are on a zoned device that wants I/O to be submitted + * in order. + */ + if (bbio->bio.bi_opf & REQ_META) { + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; + + if (btrfs_is_zoned(fs_info)) + return false; + if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) + return false; + } + + return true; +} + +/* + * Submit bio to an async queue. + * + * Return true if the work has been successfully submitted, else false. + */ +static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, + struct btrfs_io_context *bioc, + struct btrfs_io_stripe *smap, int mirror_num) +{ + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; + struct async_submit_bio *async; + + async = kmalloc(sizeof(*async), GFP_NOFS); + if (!async) + return false; + + async->bbio = bbio; + async->bioc = bioc; + async->smap = *smap; + async->mirror_num = mirror_num; + + btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, + run_one_async_free); + if (op_is_sync(bbio->bio.bi_opf)) + btrfs_queue_work(fs_info->hipri_workers, &async->work); + else + btrfs_queue_work(fs_info->workers, &async->work); + return true; +} + +static bool btrfs_submit_chunk(struct bio *bio, int mirror_num) +{ + struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_bio *orig_bbio = bbio; + u64 logical = bio->bi_iter.bi_sector << 9; + u64 length = bio->bi_iter.bi_size; + u64 map_length = length; + bool use_append = btrfs_use_zone_append(bbio); + struct btrfs_io_context *bioc = NULL; + struct btrfs_io_stripe smap; + blk_status_t ret; + int error; + + btrfs_bio_counter_inc_blocked(fs_info); + error = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, + &bioc, &smap, &mirror_num, 1); + if (error) { + ret = errno_to_blk_status(error); + goto fail; + } + + map_length = min(map_length, length); + if (use_append) + map_length = min(map_length, fs_info->max_zone_append_size); + + if (map_length < length) { + bio = btrfs_split_bio(fs_info, bio, map_length, use_append); + bbio = btrfs_bio(bio); + } + + /* + * Save the iter for the end_io handler and preload the checksums for + * data reads. + */ + if (bio_op(bio) == REQ_OP_READ && !(bio->bi_opf & REQ_META)) { + bbio->saved_iter = bio->bi_iter; + ret = btrfs_lookup_bio_sums(bbio); + if (ret) + goto fail_put_bio; + } + + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { + if (use_append) { + bio->bi_opf &= ~REQ_OP_WRITE; + bio->bi_opf |= REQ_OP_ZONE_APPEND; + ret = btrfs_extract_ordered_extent(btrfs_bio(bio)); + if (ret) + goto fail_put_bio; + } + + /* + * Csum items for reloc roots have already been cloned at this + * point, so they are handled as part of the no-checksum case.
+ */ + if (!(inode->flags & BTRFS_INODE_NODATASUM) && + !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && + !btrfs_is_data_reloc_root(inode->root)) { + if (should_async_write(bbio) && + btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num)) + goto done; + + ret = btrfs_bio_csum(bbio); + if (ret) + goto fail_put_bio; + } + } + + __btrfs_submit_bio(bio, bioc, &smap, mirror_num); +done: + return map_length == length; + +fail_put_bio: + if (map_length < length) + bio_put(bio); +fail: + btrfs_bio_counter_dec(fs_info); + btrfs_bio_end_io(orig_bbio, ret); + /* Do not submit another chunk */ + return true; +} + +void btrfs_submit_bio(struct bio *bio, int mirror_num) +{ + while (!btrfs_submit_chunk(bio, mirror_num)) + ; +} + /* * Submit a repair write. * @@ -283,7 +699,7 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror * RAID setup. Here we only want to write the one bad copy, so we do the * mapping ourselves and submit the bio directly. * - * The I/O is issued sychronously to block the repair read completion from + * The I/O is issued synchronously to block the repair read completion from * freeing the bio. */ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, @@ -381,10 +797,31 @@ int __init btrfs_bioset_init(void) offsetof(struct btrfs_bio, bio), BIOSET_NEED_BVECS)) return -ENOMEM; + if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_bio, bio), 0)) + goto out_free_bioset; + if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_bio, bio), + BIOSET_NEED_BVECS)) + goto out_free_clone_bioset; + if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE, + sizeof(struct btrfs_failed_bio))) + goto out_free_repair_bioset; return 0; + +out_free_repair_bioset: + bioset_exit(&btrfs_repair_bioset); +out_free_clone_bioset: + bioset_exit(&btrfs_clone_bioset); +out_free_bioset: + bioset_exit(&btrfs_bioset); + return -ENOMEM; } void __cold btrfs_bioset_exit(void) { + mempool_exit(&btrfs_failed_bio_pool); + bioset_exit(&btrfs_repair_bioset); + bioset_exit(&btrfs_clone_bioset); bioset_exit(&btrfs_bioset); } diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index b12f84b3b341..873ff85817f0 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -26,32 +26,23 @@ struct btrfs_fs_info; typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); /* - * Additional info to pass along bio. - * - * Mostly for btrfs specific features like csum and mirror_num. + * Highlevel btrfs I/O structure. It is allocated by btrfs_bio_alloc and + * passed to btrfs_submit_bio for mapping to the physical devices. */ struct btrfs_bio { - unsigned int mirror_num:7; - - /* - * Extra indicator for metadata bios. - * For some btrfs bios they use pages without a mapping, thus - * we can not rely on page->mapping->host to determine if - * it's a metadata bio. - */ - unsigned int is_metadata:1; - struct bvec_iter iter; - - /* for direct I/O */ + /* Inode and offset into it that this I/O operates on. */ + struct btrfs_inode *inode; u64 file_offset; - /* @device is for stripe IO submission. */ - struct btrfs_device *device; union { - /* For data checksum verification. */ + /* + * Data checksumming and original I/O information for internal + * use in the btrfs_submit_bio machinery. + */ struct { u8 *csum; u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; + struct bvec_iter saved_iter; }; /* For metadata parentness verification. 
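The new top-level btrfs_submit_bio() above reduces to "keep calling btrfs_submit_chunk() until it reports the final piece went out": each call maps as much of the bio as one stripe allows, splits off the remainder, and returns true only once map_length covered everything (or on failure, so the loop always terminates). A toy model of that control flow, with made-up sizes:

#include <stdbool.h>
#include <stdio.h>

/* Pretend each chunk can cover at most 64K of the bio. */
static bool submit_chunk(unsigned int *remaining)
{
	unsigned int chunk = (*remaining > 65536) ? 65536 : *remaining;

	printf("submitted %u bytes\n", chunk);
	*remaining -= chunk;
	return *remaining == 0;	/* true: do not submit another chunk */
}

int main(void)
{
	unsigned int remaining = 200 * 1024;	/* a 200K "bio" */

	while (!submit_chunk(&remaining))
		;
	return 0;
}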
*/ @@ -62,7 +53,9 @@ struct btrfs_bio { btrfs_bio_end_io_t end_io; void *private; - /* For read end I/O handling */ + /* For internal use in read end I/O handling */ + unsigned int mirror_num; + atomic_t pending_ios; struct work_struct end_io_work; /* @@ -80,11 +73,11 @@ static inline struct btrfs_bio *btrfs_bio(struct bio *bio) int __init btrfs_bioset_init(void); void __cold btrfs_bioset_exit(void); +void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, + btrfs_bio_end_io_t end_io, void *private); struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + struct btrfs_inode *inode, btrfs_bio_end_io_t end_io, void *private); -struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, - btrfs_bio_end_io_t end_io, void *private); - static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { @@ -92,34 +85,10 @@ static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) bbio->end_io(bbio); } -static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) -{ - if (bbio->is_metadata) - return; - if (bbio->csum != bbio->csum_inline) { - kfree(bbio->csum); - bbio->csum = NULL; - } -} +/* Bio only refers to one ordered extent. */ +#define REQ_BTRFS_ONE_ORDERED REQ_DRV -/* - * Iterate through a btrfs_bio (@bbio) on a per-sector basis. - * - * bvl - struct bio_vec - * bbio - struct btrfs_bio - * iters - struct bvec_iter - * bio_offset - unsigned int - */ -#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \ - for ((iter) = (bbio)->iter, (bio_offset) = 0; \ - (iter).bi_size && \ - (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \ - (bio_offset) += fs_info->sectorsize, \ - bio_advance_iter_single(&(bbio)->bio, &(iter), \ - (fs_info)->sectorsize)) - -void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, - int mirror_num); +void btrfs_submit_bio(struct bio *bio, int mirror_num); int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, u64 length, u64 logical, struct page *page, unsigned int pg_offset, int mirror_num); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 708d843daa72..5b10401d803b 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/sizes.h> #include <linux/list_sort.h> #include "misc.h" #include "ctree.h" @@ -539,6 +540,153 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end return total_added; } +/* + * Get an arbitrary extent item index / max_index through the block group + * + * @block_group the block group to sample from + * @index: the integral step through the block group to grab from + * @max_index: the granularity of the sampling + * @key: return value parameter for the item we find + * + * Pre-conditions on indices: + * 0 <= index <= max_index + * 0 < max_index + * + * Returns: 0 on success, 1 if the search didn't yield a useful item, negative + * error code on error. 
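With the bio.h declarations above, a submitter no longer threads fs_info through: the owning inode is captured at allocation time and btrfs_submit_bio() takes only the bio and a mirror number. A minimal sketch of a caller of the reworked interface, using only signatures visible in this patch; the helper, its end_io body, and the single-page read are illustrative, not from the patch itself:

/* Hypothetical example of driving the new interface. */
static void demo_end_io(struct btrfs_bio *bbio)
{
	/* bi_status already reflects any checksum repair done below us. */
	bio_put(&bbio->bio);
}

static void demo_submit_read(struct btrfs_inode *inode, u64 logical,
			     u64 file_offset, struct page *page)
{
	struct bio *bio = btrfs_bio_alloc(1, REQ_OP_READ, inode,
					  demo_end_io, NULL);

	bio->bi_iter.bi_sector = logical >> SECTOR_SHIFT;
	btrfs_bio(bio)->file_offset = file_offset;
	__bio_add_page(bio, page, PAGE_SIZE, 0);
	btrfs_submit_bio(bio, 0);	/* mirror 0: let btrfs pick a copy */
}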
+ */ +static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl, + struct btrfs_block_group *block_group, + int index, int max_index, + struct btrfs_key *key) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_root *extent_root; + int ret = 0; + u64 search_offset; + u64 search_end = block_group->start + block_group->length; + struct btrfs_path *path; + + ASSERT(index >= 0); + ASSERT(index <= max_index); + ASSERT(max_index > 0); + lockdep_assert_held(&caching_ctl->mutex); + lockdep_assert_held_read(&fs_info->commit_root_sem); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start, + BTRFS_SUPER_INFO_OFFSET)); + + path->skip_locking = 1; + path->search_commit_root = 1; + path->reada = READA_FORWARD; + + search_offset = index * div_u64(block_group->length, max_index); + key->objectid = block_group->start + search_offset; + key->type = BTRFS_EXTENT_ITEM_KEY; + key->offset = 0; + + while (1) { + ret = btrfs_search_forward(extent_root, key, path, 0); + if (ret != 0) + goto out; + /* Success; sampled an extent item in the block group */ + if (key->type == BTRFS_EXTENT_ITEM_KEY && + key->objectid >= block_group->start && + key->objectid + key->offset <= search_end) + goto out; + + /* We can't possibly find a valid extent item anymore */ + if (key->objectid >= search_end) { + ret = 1; + break; + } + if (key->type < BTRFS_EXTENT_ITEM_KEY) + key->type = BTRFS_EXTENT_ITEM_KEY; + else + key->objectid++; + btrfs_release_path(path); + up_read(&fs_info->commit_root_sem); + mutex_unlock(&caching_ctl->mutex); + cond_resched(); + mutex_lock(&caching_ctl->mutex); + down_read(&fs_info->commit_root_sem); + } +out: + lockdep_assert_held(&caching_ctl->mutex); + lockdep_assert_held_read(&fs_info->commit_root_sem); + btrfs_free_path(path); + return ret; +} + +/* + * Best effort attempt to compute a block group's size class while caching it. + * + * @block_group: the block group we are caching + * + * We cannot infer the size class while adding free space extents, because that + * logic doesn't care about contiguous file extents (it doesn't differentiate + * between a 100M extent and 100 contiguous 1M extents). So we need to read the + * file extent items. Reading all of them is quite wasteful, because usually + * only a handful are enough to give a good answer. Therefore, we just grab 5 of + * them at even steps through the block group and pick the smallest size class + * we see. Since size class is best effort, and not guaranteed in general, + * inaccuracy is acceptable. + * + * To be more explicit about why this algorithm makes sense: + * + * If we are caching in a block group from disk, then there are three major cases + * to consider: + * 1. the block group is well behaved and all extents in it are the same size + * class. + * 2. the block group is mostly one size class with rare exceptions for last + * ditch allocations + * 3. the block group was populated before size classes and can have a totally + * arbitrary mix of size classes. + * + * In case 1, looking at any extent in the block group will yield the correct + * result. For the mixed cases, taking the minimum size class seems like a good + * approximation, since gaps from frees will be usable to the size class. For + * 2., a small handful of file extents is likely to yield the right answer. For + * 3, we can either read every file extent, or admit that this is best effort + * anyway and try to stay fast. 
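sample_block_group_extent_item() above derives its starting key from index * (length / max_index), i.e. even steps through the block group's logical range. A plain C illustration of the offsets this produces (made-up geometry, div_u64() replaced by native division):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t bg_start = 1ULL << 30;	/* 1 GiB */
	const uint64_t bg_len = 256ULL << 20;	/* 256 MiB */
	const int max_index = 5;

	for (int index = 0; index < max_index; index++) {
		uint64_t search_offset = index * (bg_len / max_index);

		printf("sample %d starts at objectid %llu\n", index,
		       (unsigned long long)(bg_start + search_offset));
	}
	return 0;
}

This matches the five calls load_block_group_size_class() makes below, with index 0..4 and max_index 5.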
+ * + * Returns: 0 on success, negative error code on error. + */ +static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl, + struct btrfs_block_group *block_group) +{ + struct btrfs_key key; + int i; + u64 min_size = block_group->length; + enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE; + int ret; + + if (!btrfs_block_group_should_use_size_class(block_group)) + return 0; + + for (i = 0; i < 5; ++i) { + ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key); + if (ret < 0) + goto out; + if (ret > 0) + continue; + min_size = min_t(u64, min_size, key.offset); + size_class = btrfs_calc_block_group_size_class(min_size); + } + if (size_class != BTRFS_BG_SZ_NONE) { + spin_lock(&block_group->lock); + block_group->size_class = size_class; + spin_unlock(&block_group->lock); + } + +out: + return ret; +} + static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group *block_group = caching_ctl->block_group; @@ -683,6 +831,7 @@ static noinline void caching_thread(struct btrfs_work *work) mutex_lock(&caching_ctl->mutex); down_read(&fs_info->commit_root_sem); + load_block_group_size_class(caching_ctl, block_group); if (btrfs_test_opt(fs_info, SPACE_CACHE)) { ret = load_free_space_cache(block_group); if (ret == 1) { @@ -1816,7 +1965,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) * * @fs_info: the filesystem * @chunk_start: logical address of block group - * @bdev: physical device to resolve, can be NULL to indicate any device * @physical: physical address to map to logical addresses * @logical: return array of logical addresses which map to @physical * @naddrs: length of @logical @@ -1827,8 +1975,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) * block copies. */ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, - struct block_device *bdev, u64 physical, u64 **logical, - int *naddrs, int *stripe_len) + u64 physical, u64 **logical, int *naddrs, int *stripe_len) { struct extent_map *em; struct map_lookup *map; @@ -1868,9 +2015,6 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, data_stripe_length)) continue; - if (bdev && map->stripes[i].dev->bdev != bdev) - continue; - stripe_nr = physical - map->stripes[i].physical; stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset); @@ -1927,7 +2071,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); - ret = btrfs_rmap_block(fs_info, cache->start, NULL, + ret = btrfs_rmap_block(fs_info, cache->start, bytenr, &logical, &nr, &stripe_len); if (ret) return ret; @@ -3330,7 +3474,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&info->delalloc_root_lock); while (total) { - bool reclaim; + bool reclaim = false; cache = btrfs_lookup_block_group(info, bytenr); if (!cache) { @@ -3379,6 +3523,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, cache->space_info->disk_used -= num_bytes * factor; reclaim = should_reclaim_block_group(cache, num_bytes); + spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -3433,32 +3578,42 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, * reservation and return -EAGAIN, otherwise this function always succeeds. 
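load_block_group_size_class() above keeps the minimum sampled extent size and classifies once at the end; since the class thresholds are monotonic in size, that is equivalent to taking the smallest class seen. A standalone sketch of the fold, using the 128K/8M cutoffs that btrfs_calc_block_group_size_class() adopts later in this patch:

#include <stdint.h>
#include <stdio.h>

enum size_class { CLASS_NONE, CLASS_SMALL, CLASS_MEDIUM, CLASS_LARGE };

static enum size_class calc_size_class(uint64_t size)
{
	if (size <= 128 * 1024)
		return CLASS_SMALL;
	if (size <= 8 * 1024 * 1024)
		return CLASS_MEDIUM;
	return CLASS_LARGE;
}

int main(void)
{
	/* Extent sizes "sampled" from an imaginary block group. */
	const uint64_t samples[] = { 16ULL << 20, 4ULL << 20, 64ULL << 10 };
	uint64_t min_size = UINT64_MAX;

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		if (samples[i] < min_size)
			min_size = samples[i];

	/* 64K is the smallest sample, so the group classifies as SMALL. */
	printf("min extent %llu bytes -> class %d\n",
	       (unsigned long long)min_size, calc_size_class(min_size));
	return 0;
}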
*/ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, - u64 ram_bytes, u64 num_bytes, int delalloc) + u64 ram_bytes, u64 num_bytes, int delalloc, + bool force_wrong_size_class) { struct btrfs_space_info *space_info = cache->space_info; + enum btrfs_block_group_size_class size_class; int ret = 0; spin_lock(&space_info->lock); spin_lock(&cache->lock); if (cache->ro) { ret = -EAGAIN; - } else { - cache->reserved += num_bytes; - space_info->bytes_reserved += num_bytes; - trace_btrfs_space_reservation(cache->fs_info, "space_info", - space_info->flags, num_bytes, 1); - btrfs_space_info_update_bytes_may_use(cache->fs_info, - space_info, -ram_bytes); - if (delalloc) - cache->delalloc_bytes += num_bytes; + goto out; + } - /* - * Compression can use less space than we reserved, so wake - * tickets if that happens - */ - if (num_bytes < ram_bytes) - btrfs_try_granting_tickets(cache->fs_info, space_info); + if (btrfs_block_group_should_use_size_class(cache)) { + size_class = btrfs_calc_block_group_size_class(num_bytes); + ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class); + if (ret) + goto out; } + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + trace_btrfs_space_reservation(cache->fs_info, "space_info", + space_info->flags, num_bytes, 1); + btrfs_space_info_update_bytes_may_use(cache->fs_info, + space_info, -ram_bytes); + if (delalloc) + cache->delalloc_bytes += num_bytes; + + /* + * Compression can use less space than we reserved, so wake tickets if + * that happens. + */ + if (num_bytes < ram_bytes) + btrfs_try_granting_tickets(cache->fs_info, space_info); +out: spin_unlock(&cache->lock); spin_unlock(&space_info->lock); return ret; @@ -4218,3 +4373,73 @@ void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount bg->swap_extents -= amount; spin_unlock(&bg->lock); } + +enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size) +{ + if (size <= SZ_128K) + return BTRFS_BG_SZ_SMALL; + if (size <= SZ_8M) + return BTRFS_BG_SZ_MEDIUM; + return BTRFS_BG_SZ_LARGE; +} + +/* + * Handle a block group allocating an extent in a size class + * + * @bg: The block group we allocated in. + * @size_class: The size class of the allocation. + * @force_wrong_size_class: Whether we are desperate enough to allow + * mismatched size classes. + * + * Returns: 0 if the size class was valid for this block_group, -EAGAIN in the + * case of a race that leads to the wrong size class without + * force_wrong_size_class set. + * + * find_free_extent will skip block groups with a mismatched size class until + * it really needs to avoid ENOSPC. In that case it will set + * force_wrong_size_class. However, if a block group is newly allocated and + * doesn't yet have a size class, then it is possible for two allocations of + * different sizes to race and both try to use it. The loser is caught here and + * has to retry. + */ +int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, + enum btrfs_block_group_size_class size_class, + bool force_wrong_size_class) +{ + ASSERT(size_class != BTRFS_BG_SZ_NONE); + + /* The new allocation is in the right size class, do nothing */ + if (bg->size_class == size_class) + return 0; + /* + * The new allocation is in a mismatched size class. + * This means one of two things: + * + * 1. Two tasks in find_free_extent for different size_classes raced + * and hit the same empty block_group. Make the loser try again. + * 2. 
A call to find_free_extent got desperate enough to set + * 'force_wrong_size_class'. Don't change the size_class, but allow the + * allocation. + */ + if (bg->size_class != BTRFS_BG_SZ_NONE) { + if (force_wrong_size_class) + return 0; + return -EAGAIN; + } + /* + * The happy new block group case: the new allocation is the first + * one in the block_group so we set size_class. + */ + bg->size_class = size_class; + + return 0; +} + +bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg) +{ + if (btrfs_is_zoned(bg->fs_info)) + return false; + if (!btrfs_is_block_group_data_only(bg)) + return false; + return true; +} diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index a02ea76fd6cf..6e4a0b429ac3 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -12,6 +12,17 @@ enum btrfs_disk_cache_state { BTRFS_DC_SETUP, }; +enum btrfs_block_group_size_class { + /* Unset */ + BTRFS_BG_SZ_NONE, + /* 0 < size <= 128K */ + BTRFS_BG_SZ_SMALL, + /* 128K < size <= 8M */ + BTRFS_BG_SZ_MEDIUM, + /* 8M < size < BG_LENGTH */ + BTRFS_BG_SZ_LARGE, +}; + /* * This describes the state of the block_group for async discard. This is due * to the two pass nature of it where extent discarding is prioritized over @@ -233,6 +244,7 @@ struct btrfs_block_group { struct list_head active_bg_list; struct work_struct zone_finish_work; struct extent_buffer *last_eb; + enum btrfs_block_group_size_class size_class; }; static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) @@ -302,7 +314,8 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans); int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, bool alloc); int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, - u64 ram_bytes, u64 num_bytes, int delalloc); + u64 ram_bytes, u64 num_bytes, int delalloc, + bool force_wrong_size_class); void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes, int delalloc); int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, @@ -315,8 +328,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); int btrfs_free_block_groups(struct btrfs_fs_info *info); int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, - struct block_device *bdev, u64 physical, u64 **logical, - int *naddrs, int *stripe_len); + u64 physical, u64 **logical, int *naddrs, int *stripe_len); static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) { @@ -346,4 +358,10 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *cache); bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg); void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount); +enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size); +int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, + enum btrfs_block_group_size_class size_class, + bool force_wrong_size_class); +bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg); + #endif /* BTRFS_BLOCK_GROUP_H */ diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 195c09e20609..9dc21622806e 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -93,12 +93,6 @@ struct btrfs_inode { /* the io_tree does range state (DIRTY, LOCKED etc) */ struct extent_io_tree io_tree; - /* special utility tree used to record which mirrors have already been - * tried when checksums fail for a given block -
*/ - struct rb_root io_failure_tree; - spinlock_t io_failure_lock; - /* * Keep track of where the inode has extent items mapped in order to * make sure the i_size adjustments are accurate @@ -411,21 +405,11 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes -void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); -void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, - int mirror_num, enum btrfs_compression_type compress_type); -void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); -blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio); -blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, - struct bio *bio, - u64 dio_file_offset); int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected); -int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff); -unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, - u64 start, u64 end); +blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio); +bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, + u32 bio_offset, struct bio_vec *bv); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes, bool nowait, bool strict); @@ -469,7 +453,7 @@ int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, int btrfs_create_new_inode(struct btrfs_trans_handle *trans, struct btrfs_new_inode_args *args); void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args); -struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, +struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap, struct inode *dir); void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 5122ca79f7ea..f42f31f22d13 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -141,12 +141,15 @@ static int compression_decompress(int type, struct list_head *ws, static int btrfs_decompress_bio(struct compressed_bio *cb); -static void finish_compressed_bio_read(struct compressed_bio *cb) +static void end_compressed_bio_read(struct btrfs_bio *bbio) { + struct compressed_bio *cb = bbio->private; unsigned int index; struct page *page; - if (cb->status == BLK_STS_OK) + if (bbio->bio.bi_status) + cb->status = bbio->bio.bi_status; + else cb->status = errno_to_blk_status(btrfs_decompress_bio(cb)); /* Release the compressed pages */ @@ -162,54 +165,6 @@ static void finish_compressed_bio_read(struct compressed_bio *cb) /* Finally free the cb struct */ kfree(cb->compressed_pages); kfree(cb); -} - -/* - * Verify the checksums and kick off repair if needed on the uncompressed data - * before decompressing it into the original bio and freeing the uncompressed - * pages. 
- */ -static void end_compressed_bio_read(struct btrfs_bio *bbio) -{ - struct compressed_bio *cb = bbio->private; - struct inode *inode = cb->inode; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_inode *bi = BTRFS_I(inode); - bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) && - !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); - blk_status_t status = bbio->bio.bi_status; - struct bvec_iter iter; - struct bio_vec bv; - u32 offset; - - btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { - u64 start = bbio->file_offset + offset; - - if (!status && - (!csum || !btrfs_check_data_csum(bi, bbio, offset, - bv.bv_page, bv.bv_offset))) { - btrfs_clean_io_failure(bi, start, bv.bv_page, - bv.bv_offset); - } else { - int ret; - - refcount_inc(&cb->pending_ios); - ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, - bv.bv_page, bv.bv_offset, - true); - if (ret) { - refcount_dec(&cb->pending_ios); - status = errno_to_blk_status(ret); - } - } - } - - if (status) - cb->status = status; - - if (refcount_dec_and_test(&cb->pending_ios)) - finish_compressed_bio_read(cb); - btrfs_bio_free_csum(bbio); bio_put(&bbio->bio); } @@ -303,68 +258,12 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) static void end_compressed_bio_write(struct btrfs_bio *bbio) { struct compressed_bio *cb = bbio->private; - - if (bbio->bio.bi_status) - cb->status = bbio->bio.bi_status; - - if (refcount_dec_and_test(&cb->pending_ios)) { - struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); - - btrfs_record_physical_zoned(cb->inode, cb->start, &bbio->bio); - queue_work(fs_info->compressed_write_workers, &cb->write_end_work); - } - bio_put(&bbio->bio); -} - -/* - * Allocate a compressed_bio, which will be used to read/write on-disk - * (aka, compressed) * data. - * - * @cb: The compressed_bio structure, which records all the needed - * information to bind the compressed data to the uncompressed - * page cache. - * @disk_byten: The logical bytenr where the compressed data will be read - * from or written to. - * @endio_func: The endio function to call after the IO for compressed data - * is finished. - * @next_stripe_start: Return value of logical bytenr of where next stripe starts. - * Let the caller know to only fill the bio up to the stripe - * boundary. 
- */ - - -static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr, - blk_opf_t opf, - btrfs_bio_end_io_t endio_func, - u64 *next_stripe_start) -{ struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); - struct btrfs_io_geometry geom; - struct extent_map *em; - struct bio *bio; - int ret; - bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, endio_func, cb); - bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; + cb->status = bbio->bio.bi_status; + queue_work(fs_info->compressed_write_workers, &cb->write_end_work); - em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize); - if (IS_ERR(em)) { - bio_put(bio); - return ERR_CAST(em); - } - - if (bio_op(bio) == REQ_OP_ZONE_APPEND) - bio_set_dev(bio, em->map_lookup->stripes[0].dev->bdev); - - ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), disk_bytenr, &geom); - free_extent_map(em); - if (ret < 0) { - bio_put(bio); - return ERR_PTR(ret); - } - *next_stripe_start = disk_bytenr + geom.len; - refcount_inc(&cb->pending_ios); - return bio; + bio_put(&bbio->bio); } /* @@ -389,18 +288,13 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, struct bio *bio = NULL; struct compressed_bio *cb; u64 cur_disk_bytenr = disk_start; - u64 next_stripe_start; blk_status_t ret = BLK_STS_OK; - int skip_sum = inode->flags & BTRFS_INODE_NODATASUM; - const bool use_append = btrfs_use_zone_append(inode, disk_start); - const enum req_op bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE; ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(len, fs_info->sectorsize)); cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS); if (!cb) return BLK_STS_RESOURCE; - refcount_set(&cb->pending_ios, 1); cb->status = BLK_STS_OK; cb->inode = &inode->vfs_inode; cb->start = start; @@ -411,8 +305,16 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); cb->nr_pages = nr_pages; - if (blkcg_css) + if (blkcg_css) { kthread_associate_blkcg(blkcg_css); + write_flags |= REQ_CGROUP_PUNT; + } + + write_flags |= REQ_BTRFS_ONE_ORDERED; + bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_WRITE | write_flags, + BTRFS_I(cb->inode), end_compressed_bio_write, cb); + bio->bi_iter.bi_sector = cur_disk_bytenr >> SECTOR_SHIFT; + btrfs_bio(bio)->file_offset = start; while (cur_disk_bytenr < disk_start + compressed_len) { u64 offset = cur_disk_bytenr - disk_start; @@ -420,77 +322,30 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, unsigned int real_size; unsigned int added; struct page *page = compressed_pages[index]; - bool submit = false; - - /* Allocate new bio if submitted or not yet allocated */ - if (!bio) { - bio = alloc_compressed_bio(cb, cur_disk_bytenr, - bio_op | write_flags, end_compressed_bio_write, - &next_stripe_start); - if (IS_ERR(bio)) { - ret = errno_to_blk_status(PTR_ERR(bio)); - break; - } - if (blkcg_css) - bio->bi_opf |= REQ_CGROUP_PUNT; - } - /* - * We should never reach next_stripe_start start as we will - * submit comp_bio when reach the boundary immediately. 
- */ - ASSERT(cur_disk_bytenr != next_stripe_start); /* * We have various limits on the real read size: - * - stripe boundary * - page boundary * - compressed length boundary */ - real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_bytenr); - real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset)); + real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset)); real_size = min_t(u64, real_size, compressed_len - offset); ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); - if (use_append) - added = bio_add_zone_append_page(bio, page, real_size, - offset_in_page(offset)); - else - added = bio_add_page(bio, page, real_size, - offset_in_page(offset)); - /* Reached zoned boundary */ - if (added == 0) - submit = true; - + added = bio_add_page(bio, page, real_size, offset_in_page(offset)); + /* + * Maximum compressed extent is smaller than bio size limit, + * thus bio_add_page() should always succeed. + */ + ASSERT(added == real_size); cur_disk_bytenr += added; - /* Reached stripe boundary */ - if (cur_disk_bytenr == next_stripe_start) - submit = true; - - /* Finished the range */ - if (cur_disk_bytenr == disk_start + compressed_len) - submit = true; - - if (submit) { - if (!skip_sum) { - ret = btrfs_csum_one_bio(inode, bio, start, true); - if (ret) { - btrfs_bio_end_io(btrfs_bio(bio), ret); - break; - } - } - - ASSERT(bio->bi_iter.bi_size); - btrfs_submit_bio(fs_info, bio, 0); - bio = NULL; - } - cond_resched(); } + /* Finished the range. */ + ASSERT(bio->bi_iter.bi_size); + btrfs_submit_bio(bio, 0); if (blkcg_css) kthread_associate_blkcg(NULL); - - if (refcount_dec_and_test(&cb->pending_ios)) - finish_compressed_bio_write(cb); return ret; } @@ -667,10 +522,9 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, struct extent_map_tree *em_tree; struct compressed_bio *cb; unsigned int compressed_len; - struct bio *comp_bio = NULL; + struct bio *comp_bio; const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 cur_disk_byte = disk_bytenr; - u64 next_stripe_start; u64 file_offset; u64 em_len; u64 em_start; @@ -703,7 +557,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, goto out; } - refcount_set(&cb->pending_ios, 1); cb->status = BLK_STS_OK; cb->inode = inode; @@ -737,37 +590,23 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, /* include any pages we added in add_ra_bio_pages */ cb->len = bio->bi_iter.bi_size; + comp_bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, BTRFS_I(cb->inode), + end_compressed_bio_read, cb); + comp_bio->bi_iter.bi_sector = (cur_disk_byte >> SECTOR_SHIFT); + while (cur_disk_byte < disk_bytenr + compressed_len) { u64 offset = cur_disk_byte - disk_bytenr; unsigned int index = offset >> PAGE_SHIFT; unsigned int real_size; unsigned int added; struct page *page = cb->compressed_pages[index]; - bool submit = false; - - /* Allocate new bio if submitted or not yet allocated */ - if (!comp_bio) { - comp_bio = alloc_compressed_bio(cb, cur_disk_byte, - REQ_OP_READ, end_compressed_bio_read, - &next_stripe_start); - if (IS_ERR(comp_bio)) { - cb->status = errno_to_blk_status(PTR_ERR(comp_bio)); - break; - } - } - /* - * We should never reach next_stripe_start start as we will - * submit comp_bio when reach the boundary immediately.
- */ - ASSERT(cur_disk_byte != next_stripe_start); + /* * We have various limits on the real read size: - * - stripe boundary * - page boundary * - compressed length boundary */ - real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_byte); - real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset)); + real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset)); real_size = min_t(u64, real_size, compressed_len - offset); ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); @@ -778,45 +617,20 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, */ ASSERT(added == real_size); cur_disk_byte += added; - - /* Reached stripe boundary, need to submit */ - if (cur_disk_byte == next_stripe_start) - submit = true; - - /* Has finished the range, need to submit */ - if (cur_disk_byte == disk_bytenr + compressed_len) - submit = true; - - if (submit) { - /* Save the original iter for read repair */ - if (bio_op(comp_bio) == REQ_OP_READ) - btrfs_bio(comp_bio)->iter = comp_bio->bi_iter; - - /* - * Save the initial offset of this chunk, as there - * is no direct correlation between compressed pages and - * the original file offset. The field is only used for - * priting error messages. - */ - btrfs_bio(comp_bio)->file_offset = file_offset; - - ret = btrfs_lookup_bio_sums(inode, comp_bio, NULL); - if (ret) { - btrfs_bio_end_io(btrfs_bio(comp_bio), ret); - break; - } - - ASSERT(comp_bio->bi_iter.bi_size); - btrfs_submit_bio(fs_info, comp_bio, mirror_num); - comp_bio = NULL; - } } if (memstall) psi_memstall_leave(&pflags); - if (refcount_dec_and_test(&cb->pending_ios)) - finish_compressed_bio_read(cb); + /* + * Stash the initial offset of this chunk, as there is no direct + * correlation between compressed pages and the original file offset. + * The field is only used for printing error messages anyway. + */ + btrfs_bio(comp_bio)->file_offset = file_offset; + + ASSERT(comp_bio->bi_iter.bi_size); + btrfs_submit_bio(comp_bio, mirror_num); return; fail: @@ -1609,7 +1423,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, index_end = end >> PAGE_SHIFT; /* Don't miss unaligned end */ - if (!IS_ALIGNED(end, PAGE_SIZE)) + if (!PAGE_ALIGNED(end)) index_end++; curr_sample_pos = 0; @@ -1642,7 +1456,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, * * For now it's a naive and optimistic 'return true', we'll extend the logic to * quickly (compared to direct compression) detect data characteristics - * (compressible/uncompressible) to avoid wasting CPU time on uncompressible + * (compressible/incompressible) to avoid wasting CPU time on incompressible * data.
* * The following types of analysis can be performed: diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 6209d40a1e08..a5e3377db9ad 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -31,9 +31,6 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); #define BTRFS_ZLIB_DEFAULT_LEVEL 3 struct compressed_bio { - /* Number of outstanding bios */ - refcount_t pending_ios; - /* Number of compressed pages in the array */ unsigned int nr_pages; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4754c9101a4c..a5b6bb54545f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -484,7 +484,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (ret) return ret; } - btrfs_clean_tree_block(buf); + btrfs_clear_buffer_dirty(trans, buf); *last_ref = 1; } return 0; @@ -853,8 +853,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, /* * Search for a key in the given extent_buffer. * - * The lower boundary for the search is specified by the slot number @low. Use a - * value of 0 to search over the whole extent buffer. + * The lower boundary for the search is specified by the slot number @first_slot. + * Use a value of 0 to search over the whole extent buffer. * * The slot in the extent buffer is returned via @slot. If the key exists in the * extent buffer, then @slot will point to the slot where the key is, otherwise @@ -863,18 +863,23 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, * Slot may point to the total number of items (i.e. one position beyond the last * key) if the key is bigger than the last key in the extent buffer. */ -static noinline int generic_bin_search(struct extent_buffer *eb, int low, - const struct btrfs_key *key, int *slot) +int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot, + const struct btrfs_key *key, int *slot) { unsigned long p; int item_size; - int high = btrfs_header_nritems(eb); + /* + * Use unsigned types for the low and high slots, so that we get a more + * efficient division in the search loop below. + */ + u32 low = first_slot; + u32 high = btrfs_header_nritems(eb); int ret; const int key_size = sizeof(struct btrfs_disk_key); - if (low > high) { + if (unlikely(low > high)) { btrfs_err(eb->fs_info, - "%s: low (%d) > high (%d) eb %llu owner %llu level %d", + "%s: low (%u) > high (%u) eb %llu owner %llu level %d", __func__, low, high, eb->start, btrfs_header_owner(eb), btrfs_header_level(eb)); return -EINVAL; @@ -925,16 +930,6 @@ static noinline int generic_bin_search(struct extent_buffer *eb, int low, return 1; } -/* - * Simple binary search on an extent buffer. Works for both leaves and nodes, and - * always searches over the whole range of keys (slot 0 to slot 'nritems - 1'). 
- */ -int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, - int *slot) -{ - return generic_bin_search(eb, 0, key, slot); -} - static void root_add_used(struct btrfs_root *root, u32 size) { spin_lock(&root->accounting_lock); @@ -1054,7 +1049,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, path->locks[level] = 0; path->nodes[level] = NULL; - btrfs_clean_tree_block(mid); + btrfs_clear_buffer_dirty(trans, mid); btrfs_tree_unlock(mid); /* once for the path */ free_extent_buffer(mid); @@ -1115,7 +1110,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret < 0 && wret != -ENOSPC) ret = wret; if (btrfs_header_nritems(right) == 0) { - btrfs_clean_tree_block(right); + btrfs_clear_buffer_dirty(trans, right); btrfs_tree_unlock(right); del_ptr(root, path, level + 1, pslot + 1); root_sub_used(root, right->len); @@ -1161,7 +1156,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BUG_ON(wret == 1); } if (btrfs_header_nritems(mid) == 0) { - btrfs_clean_tree_block(mid); + btrfs_clear_buffer_dirty(trans, mid); btrfs_tree_unlock(mid); del_ptr(root, path, level + 1, pslot); root_sub_used(root, mid->len); @@ -1869,7 +1864,7 @@ static inline int search_for_key_slot(struct extent_buffer *eb, return 0; } - return generic_bin_search(eb, search_low_slot, key, slot); + return btrfs_generic_bin_search(eb, search_low_slot, key, slot); } static int search_leaf(struct btrfs_trans_handle *trans, @@ -3041,7 +3036,8 @@ noinline int btrfs_leaf_free_space(struct extent_buffer *leaf) * min slot controls the lowest index we're willing to push to the * right. We'll push up to and including min_slot, but no lower */ -static noinline int __push_leaf_right(struct btrfs_path *path, +static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, + struct btrfs_path *path, int data_size, int empty, struct extent_buffer *right, int free_space, u32 left_nritems, @@ -3139,7 +3135,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path, if (left_nritems) btrfs_mark_buffer_dirty(left); else - btrfs_clean_tree_block(left); + btrfs_clear_buffer_dirty(trans, left); btrfs_mark_buffer_dirty(right); @@ -3151,7 +3147,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path, if (path->slots[0] >= left_nritems) { path->slots[0] -= left_nritems; if (btrfs_header_nritems(path->nodes[0]) == 0) - btrfs_clean_tree_block(path->nodes[0]); + btrfs_clear_buffer_dirty(trans, path->nodes[0]); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3243,8 +3239,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root return 0; } - return __push_leaf_right(path, min_data_size, empty, - right, free_space, left_nritems, min_slot); + return __push_leaf_right(trans, path, min_data_size, empty, right, + free_space, left_nritems, min_slot); out_unlock: btrfs_tree_unlock(right); free_extent_buffer(right); @@ -3259,7 +3255,8 @@ out_unlock: * item at 'max_slot' won't be touched. 
Use (u32)-1 to make us do all the * items */ -static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, +static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, + struct btrfs_path *path, int data_size, int empty, struct extent_buffer *left, int free_space, u32 right_nritems, u32 max_slot) @@ -3363,7 +3360,7 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, if (right_nritems) btrfs_mark_buffer_dirty(right); else - btrfs_clean_tree_block(right); + btrfs_clear_buffer_dirty(trans, right); btrfs_item_key(right, &disk_key, 0); fixup_low_keys(path, &disk_key, 1); @@ -3449,9 +3446,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root ret = -EUCLEAN; goto out; } - return __push_leaf_left(path, min_data_size, - empty, left, free_space, right_nritems, - max_slot); + return __push_leaf_left(trans, path, min_data_size, empty, left, + free_space, right_nritems, max_slot); out: btrfs_tree_unlock(left); free_extent_buffer(left); @@ -4400,7 +4396,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (leaf == root->node) { btrfs_set_header_level(leaf, 0); } else { - btrfs_clean_tree_block(leaf); + btrfs_clear_buffer_dirty(trans, leaf); btrfs_del_leaf(trans, root, path, leaf); } } else { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6965703a81b6..97897107fab5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -507,8 +507,21 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); /* ctree.c */ int __init btrfs_ctree_init(void); void __cold btrfs_ctree_exit(void); + +int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot, + const struct btrfs_key *key, int *slot); + +/* + * Simple binary search on an extent buffer. Works for both leaves and nodes, and + * always searches over the whole range of keys (slot 0 to slot 'nritems - 1'). + */ +static inline int btrfs_bin_search(struct extent_buffer *eb, + const struct btrfs_key *key, + int *slot) +{ + return btrfs_generic_bin_search(eb, 0, key, slot); +} + -int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, - int *slot); int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index d81b764a7644..8065341d831a 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -765,7 +765,7 @@ again: break; unlock_page(page); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); lock_page(page); /* @@ -999,7 +999,7 @@ next: } #define CLUSTER_SIZE (SZ_256K) -static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); +static_assert(PAGE_ALIGNED(CLUSTER_SIZE)); /* * Defrag one contiguous target range.
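The btrfs_generic_bin_search() rework above makes the low and high slots unsigned so the midpoint division in the search loop can compile down to a plain shift, instead of a signed division that needs extra instructions to round toward zero. A standalone sketch of the same pattern, written as userspace C against a made-up key array rather than extent buffer accessors (illustrative only, not kernel code):

#include <stdint.h>
#include <stdio.h>

/* Unsigned low/high let the compiler turn (low + high) / 2 into a shift. */
static int bin_search(const uint64_t *keys, uint32_t nritems,
		      uint64_t key, uint32_t *slot)
{
	uint32_t low = 0;
	uint32_t high = nritems;

	while (low < high) {
		uint32_t mid = (low + high) / 2;

		if (keys[mid] < key) {
			low = mid + 1;
		} else if (keys[mid] > key) {
			high = mid;
		} else {
			*slot = mid;
			return 0;	/* exact match */
		}
	}
	*slot = low;	/* first slot with a bigger key, may equal nritems */
	return 1;
}

int main(void)
{
	const uint64_t keys[] = { 10, 20, 30, 40 };
	uint32_t slot;
	int found = (bin_search(keys, 4, 30, &slot) == 0);

	printf("found=%d slot=%u\n", found, slot);	/* found=1 slot=2 */
	return 0;
}

The return convention mirrors the comment above the kernel function: 0 with *slot at the matching slot, or 1 with *slot at the position where the key would be inserted, possibly one past the last item.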
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 573ebab886e2..886ffb232eac 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -437,8 +437,7 @@ int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, return 0; } -static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, +static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref) { @@ -452,8 +451,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, atomic_dec(&delayed_refs->num_entries); } -static bool merge_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, +static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref, u64 seq) @@ -482,10 +480,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans, mod = -next->ref_mod; } - drop_delayed_ref(trans, delayed_refs, head, next); + drop_delayed_ref(delayed_refs, head, next); ref->ref_mod += mod; if (ref->ref_mod == 0) { - drop_delayed_ref(trans, delayed_refs, head, ref); + drop_delayed_ref(delayed_refs, head, ref); done = true; } else { /* @@ -499,11 +497,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans, return done; } -void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, +void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { - struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_node *ref; struct rb_node *node; u64 seq = 0; @@ -524,7 +521,7 @@ again: ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); if (seq && ref->seq >= seq) continue; - if (merge_ref(trans, delayed_refs, head, ref, seq)) + if (merge_ref(delayed_refs, head, ref, seq)) goto again; } } @@ -601,8 +598,7 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, * Return 0 for insert. * Return >0 for merge. 
*/ -static int insert_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *root, +static int insert_delayed_ref(struct btrfs_delayed_ref_root *root, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *ref) { @@ -641,7 +637,7 @@ static int insert_delayed_ref(struct btrfs_trans_handle *trans, /* remove existing tail if its ref_mod is zero */ if (exist->ref_mod == 0) - drop_delayed_ref(trans, root, href, exist); + drop_delayed_ref(root, href, exist); spin_unlock(&href->lock); return ret; inserted: @@ -978,7 +974,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); - ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); + ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); /* @@ -1070,7 +1066,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); - ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); + ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); /* diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index d6304b690ec4..2eb34abf700f 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -357,7 +357,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op); -void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, +void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index ff2e524d9937..317aeff6c1da 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -78,6 +78,7 @@ static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { + lockdep_assert_held(&discard_ctl->lock); if (!btrfs_run_discard_work(discard_ctl)) return; @@ -89,6 +90,8 @@ static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, BTRFS_DISCARD_DELAY); block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; } + if (list_empty(&block_group->discard_list)) + btrfs_get_block_group(block_group); list_move_tail(&block_group->discard_list, get_discard_list(discard_ctl, block_group)); @@ -108,8 +111,12 @@ static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { + bool queued; + spin_lock(&discard_ctl->lock); + queued = !list_empty(&block_group->discard_list); + if (!btrfs_run_discard_work(discard_ctl)) { spin_unlock(&discard_ctl->lock); return; @@ -121,6 +128,8 @@ static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl, block_group->discard_eligible_time = (ktime_get_ns() + BTRFS_DISCARD_UNUSED_DELAY); block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; + if (!queued) + btrfs_get_block_group(block_group); list_add_tail(&block_group->discard_list, &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]); @@ -131,6 +140,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { bool running = 
false; + bool queued = false; spin_lock(&discard_ctl->lock); @@ -140,7 +150,16 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, } block_group->discard_eligible_time = 0; + queued = !list_empty(&block_group->discard_list); list_del_init(&block_group->discard_list); + /* + * If the block group is currently running in the discard workfn, we + * don't want to deref it, since it's still being used by the workfn. + * The workfn will notice this case and deref the block group when it is + * finished. + */ + if (queued && !running) + btrfs_put_block_group(block_group); spin_unlock(&discard_ctl->lock); @@ -214,10 +233,12 @@ again: if (block_group && now >= block_group->discard_eligible_time) { if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED && block_group->used != 0) { - if (btrfs_is_block_group_data_only(block_group)) + if (btrfs_is_block_group_data_only(block_group)) { __add_to_discard_list(discard_ctl, block_group); - else + } else { list_del_init(&block_group->discard_list); + btrfs_put_block_group(block_group); + } goto again; } if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) { @@ -511,6 +532,15 @@ static void btrfs_discard_workfn(struct work_struct *work) spin_lock(&discard_ctl->lock); discard_ctl->prev_discard = trimmed; discard_ctl->prev_discard_time = now; + /* + * If the block group was removed from the discard list while it was + * running in this workfn, then we didn't deref it, since this function + * still owned that reference. But we set the discard_ctl->block_group + * back to NULL, so we can use that condition to know that now we need + * to deref the block_group. + */ + if (discard_ctl->block_group == NULL) + btrfs_put_block_group(block_group); discard_ctl->block_group = NULL; __btrfs_discard_schedule_work(discard_ctl, now, false); spin_unlock(&discard_ctl->lock); @@ -651,8 +681,12 @@ void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info) list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs, bg_list) { list_del_init(&block_group->bg_list); - btrfs_put_block_group(block_group); btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); + /* + * This put is for the get done by btrfs_mark_bg_unused. + * Queueing discard incremented it for discard's reference. + */ + btrfs_put_block_group(block_group); } spin_unlock(&fs_info->unused_bgs_lock); } @@ -683,6 +717,7 @@ static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl) if (block_group->used == 0) btrfs_mark_bg_unused(block_group); spin_lock(&discard_ctl->lock); + btrfs_put_block_group(block_group); } } spin_unlock(&discard_ctl->lock); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3aa04224315e..b53f0e30ce2b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -79,23 +79,6 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) } /* - * async submit bios are used to offload expensive checksumming - * onto the worker threads. They checksum file and metadata bios - * just before they are sent down the IO stack. - */ -struct async_submit_bio { - struct btrfs_inode *inode; - struct bio *bio; - enum btrfs_wq_submit_cmd submit_cmd; - int mirror_num; - - /* Optional parameter for used by direct io */ - u64 dio_file_offset; - struct btrfs_work work; - blk_status_t status; -}; - -/* * Compute the csum of a btree block and store the result to provided buffer. 
*/ static void csum_tree_block(struct extent_buffer *buf, u8 *result) @@ -455,6 +438,22 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec return csum_one_extent_buffer(eb); } +blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) +{ + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; + struct bvec_iter iter; + struct bio_vec bv; + int ret = 0; + + bio_for_each_segment(bv, &bbio->bio, iter) { + ret = csum_dirty_buffer(fs_info, &bv); + if (ret) + break; + } + + return errno_to_blk_status(ret); +} + static int check_tree_block_fsid(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; @@ -700,172 +699,6 @@ err: return ret; } -static void run_one_async_start(struct btrfs_work *work) -{ - struct async_submit_bio *async; - blk_status_t ret; - - async = container_of(work, struct async_submit_bio, work); - switch (async->submit_cmd) { - case WQ_SUBMIT_METADATA: - ret = btree_submit_bio_start(async->bio); - break; - case WQ_SUBMIT_DATA: - ret = btrfs_submit_bio_start(async->inode, async->bio); - break; - case WQ_SUBMIT_DATA_DIO: - ret = btrfs_submit_bio_start_direct_io(async->inode, - async->bio, async->dio_file_offset); - break; - } - if (ret) - async->status = ret; -} - -/* - * In order to insert checksums into the metadata in large chunks, we wait - * until bio submission time. All the pages in the bio are checksummed and - * sums are attached onto the ordered extent record. - * - * At IO completion time the csums attached on the ordered extent record are - * inserted into the tree. - */ -static void run_one_async_done(struct btrfs_work *work) -{ - struct async_submit_bio *async = - container_of(work, struct async_submit_bio, work); - struct btrfs_inode *inode = async->inode; - struct btrfs_bio *bbio = btrfs_bio(async->bio); - - /* If an error occurred we just want to clean up the bio and move on */ - if (async->status) { - btrfs_bio_end_io(bbio, async->status); - return; - } - - /* - * All of the bios that pass through here are from async helpers. - * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context. - * This changes nothing when cgroups aren't in use. - */ - async->bio->bi_opf |= REQ_CGROUP_PUNT; - btrfs_submit_bio(inode->root->fs_info, async->bio, async->mirror_num); -} - -static void run_one_async_free(struct btrfs_work *work) -{ - struct async_submit_bio *async; - - async = container_of(work, struct async_submit_bio, work); - kfree(async); -} - -/* - * Submit bio to an async queue. 
- * - * Retrun: - * - true if the work has been succesfuly submitted - * - false in case of error - */ -bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, - u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct async_submit_bio *async; - - async = kmalloc(sizeof(*async), GFP_NOFS); - if (!async) - return false; - - async->inode = inode; - async->bio = bio; - async->mirror_num = mirror_num; - async->submit_cmd = cmd; - - btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, - run_one_async_free); - - async->dio_file_offset = dio_file_offset; - - async->status = 0; - - if (op_is_sync(bio->bi_opf)) - btrfs_queue_work(fs_info->hipri_workers, &async->work); - else - btrfs_queue_work(fs_info->workers, &async->work); - return true; -} - -static blk_status_t btree_csum_one_bio(struct bio *bio) -{ - struct bio_vec *bvec; - struct btrfs_root *root; - int ret = 0; - struct bvec_iter_all iter_all; - - ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) { - root = BTRFS_I(bvec->bv_page->mapping->host)->root; - ret = csum_dirty_buffer(root->fs_info, bvec); - if (ret) - break; - } - - return errno_to_blk_status(ret); -} - -blk_status_t btree_submit_bio_start(struct bio *bio) -{ - /* - * when we're called for a write, we're already in the async - * submission context. Just jump into btrfs_submit_bio. - */ - return btree_csum_one_bio(bio); -} - -static bool should_async_write(struct btrfs_fs_info *fs_info, - struct btrfs_inode *bi) -{ - if (btrfs_is_zoned(fs_info)) - return false; - if (atomic_read(&bi->sync_writers)) - return false; - if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) - return false; - return true; -} - -void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_bio *bbio = btrfs_bio(bio); - blk_status_t ret; - - bio->bi_opf |= REQ_META; - bbio->is_metadata = 1; - - if (btrfs_op(bio) != BTRFS_MAP_WRITE) { - btrfs_submit_bio(fs_info, bio, mirror_num); - return; - } - - /* - * Kthread helpers are used to submit writes so that checksumming can - * happen in parallel across all CPUs. 
- */ - if (should_async_write(fs_info, inode) && - btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_METADATA)) - return; - - ret = btree_csum_one_bio(bio); - if (ret) { - btrfs_bio_end_io(bbio, ret); - return; - } - - btrfs_submit_bio(fs_info, bio, mirror_num); -} - #ifdef CONFIG_MIGRATION static int btree_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) @@ -1035,22 +868,6 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, } -void btrfs_clean_tree_block(struct extent_buffer *buf) -{ - struct btrfs_fs_info *fs_info = buf->fs_info; - if (btrfs_header_generation(buf) == - fs_info->running_transaction->transid) { - btrfs_assert_tree_write_locked(buf); - - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { - percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, - -buf->len, - fs_info->dirty_metadata_batch); - clear_extent_buffer_dirty(buf); - } - } -} - static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { @@ -1910,6 +1727,9 @@ static int cleaner_kthread(void *arg) goto sleep; } + if (test_and_clear_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags)) + btrfs_sysfs_feature_update(fs_info); + btrfs_run_delayed_iputs(fs_info); again = btrfs_clean_one_deleted_snapshot(fs_info); @@ -5159,11 +4979,12 @@ static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, start += fs_info->nodesize; if (!eb) continue; + + btrfs_tree_lock(eb); wait_on_extent_buffer_writeback(eb); + btrfs_clear_buffer_dirty(NULL, eb); + btrfs_tree_unlock(eb); - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, - &eb->bflags)) - clear_extent_buffer_dirty(eb); free_extent_buffer_stale(eb); } } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index f2f295eb6103..4d5772330110 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -39,7 +39,8 @@ struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, int level); -void btrfs_clean_tree_block(struct extent_buffer *buf); +void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, + struct extent_buffer *buf); void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info); int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, @@ -86,7 +87,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, struct page *page, u64 start, u64 end, int mirror); -void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif @@ -114,15 +114,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int btrfs_read_extent_buffer(struct extent_buffer *buf, struct btrfs_tree_parent_check *check); -enum btrfs_wq_submit_cmd { - WQ_SUBMIT_METADATA, - WQ_SUBMIT_DATA, - WQ_SUBMIT_DATA_DIO, -}; - -bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, - u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd); -blk_status_t btree_submit_bio_start(struct bio *bio); +blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 3c7766dfaa69..29a225836e28 
100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -972,8 +972,8 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, { struct extent_state *state; struct extent_state *prealloc = NULL; - struct rb_node **p; - struct rb_node *parent; + struct rb_node **p = NULL; + struct rb_node *parent = NULL; int err = 0; u64 last_start; u64 last_end; @@ -1218,8 +1218,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, { struct extent_state *state; struct extent_state *prealloc = NULL; - struct rb_node **p; - struct rb_node *parent; + struct rb_node **p = NULL; + struct rb_node *parent = NULL; int err = 0; u64 last_start; u64 last_end; @@ -1625,7 +1625,7 @@ search: } /* - * Searche a range in the state tree for a given mask. If 'filled' == 1, this + * Search a range in the state tree for a given mask. If 'filled' == 1, this * returns 1 only if every extent in the tree has the bits set. Otherwise, 1 * is returned if any bit in the range is found set. */ diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index e3eeec380844..21766e49ec02 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -6,7 +6,6 @@ #include "misc.h" struct extent_changeset; -struct io_failure_record; /* Bits for the extent state */ enum { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 72ba13b027a9..824c657f59e8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -16,7 +16,8 @@ #include <linux/percpu_counter.h> #include <linux/lockdep.h> #include <linux/crc32c.h> -#include "misc.h" +#include "ctree.h" +#include "extent-tree.h" #include "tree-log.h" #include "disk-io.h" #include "print-tree.h" @@ -31,14 +32,12 @@ #include "space-info.h" #include "block-rsv.h" #include "delalloc-space.h" -#include "block-group.h" #include "discard.h" #include "rcu-string.h" #include "zoned.h" #include "dev-replace.h" #include "fs.h" #include "accessors.h" -#include "extent-tree.h" #include "root-tree.h" #include "file-item.h" #include "orphan.h" @@ -1966,7 +1965,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, cond_resched(); spin_lock(&locked_ref->lock); - btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); + btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); } return 0; @@ -2013,7 +2012,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, * insert_inline_extent_backref()). */ spin_lock(&locked_ref->lock); - btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); + btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &actual_count); @@ -3385,7 +3384,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) enum btrfs_loop_type { LOOP_CACHING_NOWAIT, LOOP_CACHING_WAIT, + LOOP_UNSET_SIZE_CLASS, LOOP_ALLOC_CHUNK, + LOOP_WRONG_SIZE_CLASS, LOOP_NO_EMPTY_SIZE, }; @@ -3453,81 +3454,6 @@ btrfs_release_block_group(struct btrfs_block_group *cache, btrfs_put_block_group(cache); } -enum btrfs_extent_allocation_policy { - BTRFS_EXTENT_ALLOC_CLUSTERED, - BTRFS_EXTENT_ALLOC_ZONED, -}; - -/* - * Structure used internally for find_free_extent() function. Wraps needed - * parameters. 
- */ -struct find_free_extent_ctl { - /* Basic allocation info */ - u64 ram_bytes; - u64 num_bytes; - u64 min_alloc_size; - u64 empty_size; - u64 flags; - int delalloc; - - /* Where to start the search inside the bg */ - u64 search_start; - - /* For clustered allocation */ - u64 empty_cluster; - struct btrfs_free_cluster *last_ptr; - bool use_cluster; - - bool have_caching_bg; - bool orig_have_caching_bg; - - /* Allocation is called for tree-log */ - bool for_treelog; - - /* Allocation is called for data relocation */ - bool for_data_reloc; - - /* RAID index, converted from flags */ - int index; - - /* - * Current loop number, check find_free_extent_update_loop() for details - */ - int loop; - - /* - * Whether we're refilling a cluster, if true we need to re-search - * current block group but don't try to refill the cluster again. - */ - bool retry_clustered; - - /* - * Whether we're updating free space cache, if true we need to re-search - * current block group but don't try updating free space cache again. - */ - bool retry_unclustered; - - /* If current block group is cached */ - int cached; - - /* Max contiguous hole found */ - u64 max_extent_size; - - /* Total free space from free space cache, not always contiguous */ - u64 total_free_space; - - /* Found result */ - u64 found_offset; - - /* Hint where to start looking for an empty space */ - u64 hint_byte; - - /* Allocation policy */ - enum btrfs_extent_allocation_policy policy; -}; - - /* * Helper function for find_free_extent(). * @@ -3559,8 +3485,7 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg, if (offset) { /* We have a block, we're done */ spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(cluster_bg, - ffe_ctl->search_start, ffe_ctl->num_bytes); + trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl); *cluster_bg_ret = cluster_bg; ffe_ctl->found_offset = offset; return 0; @@ -3610,10 +3535,8 @@ refill_cluster: if (offset) { /* We found one, proceed */ spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(bg, - ffe_ctl->search_start, - ffe_ctl->num_bytes); ffe_ctl->found_offset = offset; + trace_btrfs_reserve_extent_cluster(bg, ffe_ctl); return 0; } } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && @@ -4028,24 +3951,6 @@ static int can_allocate_chunk(struct btrfs_fs_info *fs_info, } } -static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl) -{ - switch (ffe_ctl->policy) { - case BTRFS_EXTENT_ALLOC_CLUSTERED: - /* - * If we can't allocate a new chunk we've already looped through - * at least once, move on to the NO_EMPTY_SIZE case. - */ - ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; - return 0; - case BTRFS_EXTENT_ALLOC_ZONED: - /* Give up here */ - return -ENOSPC; - default: - BUG(); - } -} - /* * Return >0 means caller needs to re-search for free extent * Return 0 means we have the needed free extent. 
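The find_free_extent_update_loop() changes in the next hunk flatten the allocator's retry ladder into a simple ffe_ctl->loop++ escalation, and the new find_free_extent_check_size_class() relaxes the size-class filter one stage at a time: first block groups with no size class become eligible, then the class check is dropped entirely. A toy model of that escalation, with stand-in names for anything not shown in the diff (illustrative only, not kernel code):

#include <stdio.h>

enum toy_loop {
	LOOP_CACHING_NOWAIT,
	LOOP_CACHING_WAIT,
	LOOP_UNSET_SIZE_CLASS,	/* groups without a size class become eligible */
	LOOP_ALLOC_CHUNK,
	LOOP_WRONG_SIZE_CLASS,	/* size classes are ignored from here on */
	LOOP_NO_EMPTY_SIZE,
};

#define TOY_SZ_NONE 0	/* stands in for BTRFS_BG_SZ_NONE */

/* Mirrors the shape of find_free_extent_check_size_class(): is this
 * block group acceptable at the current loop stage? */
static int size_class_ok(enum toy_loop loop, int bg_class, int want_class)
{
	if (loop >= LOOP_WRONG_SIZE_CLASS)
		return 1;
	if (loop >= LOOP_UNSET_SIZE_CLASS && bg_class == TOY_SZ_NONE)
		return 1;
	return bg_class == want_class;
}

int main(void)
{
	enum toy_loop loop = LOOP_CACHING_NOWAIT;
	int bg_class = TOY_SZ_NONE;	/* the only available group has no class */
	int want_class = 2;

	while (!size_class_ok(loop, bg_class, want_class)) {
		loop++;			/* same escalation as ffe_ctl->loop++ */
		printf("escalated to loop stage %d\n", loop);
	}
	printf("allocation allowed at stage %d\n", loop);
	return 0;
}

Here the unset-class group is rejected twice, then accepted once the loop reaches LOOP_UNSET_SIZE_CLASS, which matches the intent of preferring same-class groups in early passes without ever failing an allocation outright.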
@@ -4079,31 +3984,28 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking * caching kthreads as we move along * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching + * LOOP_UNSET_SIZE_CLASS, allow unset size class * LOOP_ALLOC_CHUNK, force a chunk allocation and try again * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try * again */ if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { ffe_ctl->index = 0; - if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) { - /* - * We want to skip the LOOP_CACHING_WAIT step if we - * don't have any uncached bgs and we've already done a - * full search through. - */ - if (ffe_ctl->orig_have_caching_bg || !full_search) - ffe_ctl->loop = LOOP_CACHING_WAIT; - else - ffe_ctl->loop = LOOP_ALLOC_CHUNK; - } else { + /* + * We want to skip the LOOP_CACHING_WAIT step if we don't have + * any uncached bgs and we've already done a full search + * through. + */ + if (ffe_ctl->loop == LOOP_CACHING_NOWAIT && + (!ffe_ctl->orig_have_caching_bg && full_search)) ffe_ctl->loop++; - } + ffe_ctl->loop++; if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { struct btrfs_trans_handle *trans; int exist = 0; - /*Check if allocation policy allows to create a new chunk */ + /* Check if allocation policy allows to create a new chunk */ ret = can_allocate_chunk(fs_info, ffe_ctl); if (ret) return ret; @@ -4123,8 +4025,10 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, CHUNK_ALLOC_FORCE_FOR_EXTENT); /* Do not bail out on ENOSPC since we can do more. */ - if (ret == -ENOSPC) - ret = chunk_allocation_failed(ffe_ctl); + if (ret == -ENOSPC) { + ret = 0; + ffe_ctl->loop++; + } else if (ret < 0) btrfs_abort_transaction(trans, ret); else @@ -4154,6 +4058,21 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return -ENOSPC; } +static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group *bg) +{ + if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) + return true; + if (!btrfs_block_group_should_use_size_class(bg)) + return true; + if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) + return true; + if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && + bg->size_class == BTRFS_BG_SZ_NONE) + return true; + return ffe_ctl->size_class == bg->size_class; +} + static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info, @@ -4288,6 +4207,7 @@ static noinline int find_free_extent(struct btrfs_root *root, ffe_ctl->total_free_space = 0; ffe_ctl->found_offset = 0; ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED; + ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes); if (btrfs_is_zoned(fs_info)) ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED; @@ -4296,8 +4216,7 @@ static noinline int find_free_extent(struct btrfs_root *root, ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size, - ffe_ctl->flags); + trace_find_free_extent(root, ffe_ctl); space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags); if (!space_info) { @@ -4340,6 +4259,7 @@ static noinline int find_free_extent(struct btrfs_root *root, block_group->flags); btrfs_lock_block_group(block_group, ffe_ctl->delalloc); + ffe_ctl->hinted = true; goto have_block_group; } } else if (block_group) { @@ -4347,6 +4267,7 @@ static noinline int find_free_extent(struct btrfs_root *root, } } search: + 
trace_find_free_extent_search_loop(root, ffe_ctl); ffe_ctl->have_caching_bg = false; if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) || ffe_ctl->index == 0) @@ -4356,6 +4277,7 @@ search: &space_info->block_groups[ffe_ctl->index], list) { struct btrfs_block_group *bg_ret; + ffe_ctl->hinted = false; /* If the block group is read-only, we can skip it entirely. */ if (unlikely(block_group->ro)) { if (ffe_ctl->for_treelog) @@ -4397,6 +4319,7 @@ search: } have_block_group: + trace_find_free_extent_have_block_group(root, ffe_ctl, block_group); ffe_ctl->cached = btrfs_block_group_done(block_group); if (unlikely(!ffe_ctl->cached)) { ffe_ctl->have_caching_bg = true; @@ -4421,6 +4344,9 @@ have_block_group: if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) goto loop; + if (!find_free_extent_check_size_class(ffe_ctl, block_group)) + goto loop; + bg_ret = NULL; ret = do_allocation(block_group, ffe_ctl, &bg_ret); if (ret == 0) { @@ -4455,7 +4381,8 @@ have_block_group: ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes, ffe_ctl->num_bytes, - ffe_ctl->delalloc); + ffe_ctl->delalloc, + ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS); if (ret == -EAGAIN) { btrfs_add_free_space_unused(block_group, ffe_ctl->found_offset, @@ -4468,8 +4395,7 @@ have_block_group: ins->objectid = ffe_ctl->search_start; ins->offset = ffe_ctl->num_bytes; - trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start, - ffe_ctl->num_bytes); + trace_btrfs_reserve_extent(block_group, ffe_ctl); btrfs_release_block_group(block_group, ffe_ctl->delalloc); break; loop: @@ -4912,7 +4838,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level); __btrfs_tree_lock(buf, nest); - btrfs_clean_tree_block(buf); + btrfs_clear_buffer_dirty(trans, buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags); @@ -5542,13 +5468,12 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, } } } - /* make block locked assertion in btrfs_clean_tree_block happy */ - if (!path->locks[level] && - btrfs_header_generation(eb) == trans->transid) { + /* Make block locked assertion in btrfs_clear_buffer_dirty happy. 
*/ + if (!path->locks[level]) { btrfs_tree_lock(eb); path->locks[level] = BTRFS_WRITE_LOCK; } - btrfs_clean_tree_block(eb); + btrfs_clear_buffer_dirty(trans, eb); } if (eb == root->node) { diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index ae5425253603..0c958fc1b3b8 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -3,6 +3,87 @@ #ifndef BTRFS_EXTENT_TREE_H #define BTRFS_EXTENT_TREE_H +#include "misc.h" +#include "block-group.h" + +struct btrfs_free_cluster; + +enum btrfs_extent_allocation_policy { + BTRFS_EXTENT_ALLOC_CLUSTERED, + BTRFS_EXTENT_ALLOC_ZONED, +}; + +struct find_free_extent_ctl { + /* Basic allocation info */ + u64 ram_bytes; + u64 num_bytes; + u64 min_alloc_size; + u64 empty_size; + u64 flags; + int delalloc; + + /* Where to start the search inside the bg */ + u64 search_start; + + /* For clustered allocation */ + u64 empty_cluster; + struct btrfs_free_cluster *last_ptr; + bool use_cluster; + + bool have_caching_bg; + bool orig_have_caching_bg; + + /* Allocation is called for tree-log */ + bool for_treelog; + + /* Allocation is called for data relocation */ + bool for_data_reloc; + + /* RAID index, converted from flags */ + int index; + + /* + * Current loop number, check find_free_extent_update_loop() for details + */ + int loop; + + /* + * Whether we're refilling a cluster, if true we need to re-search + * current block group but don't try to refill the cluster again. + */ + bool retry_clustered; + + /* + * Whether we're updating free space cache, if true we need to re-search + * current block group but don't try updating free space cache again. + */ + bool retry_unclustered; + + /* If current block group is cached */ + int cached; + + /* Max contiguous hole found */ + u64 max_extent_size; + + /* Total free space from free space cache, not always contiguous */ + u64 total_free_space; + + /* Found result */ + u64 found_offset; + + /* Hint where to start looking for an empty space */ + u64 hint_byte; + + /* Allocation policy */ + enum btrfs_extent_allocation_policy policy; + + /* Whether or not the allocator is currently following a hint */ + bool hinted; + + /* Size class of block groups to prefer in early loops */ + enum btrfs_block_group_size_class size_class; +}; + enum btrfs_inline_ref_type { BTRFS_REF_TYPE_INVALID, BTRFS_REF_TYPE_BLOCK, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9bd32daa9b9a..40300e8e5f99 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -36,6 +36,7 @@ #include "file.h" #include "dev-replace.h" #include "super.h" +#include "transaction.h" static struct kmem_cache *extent_buffer_cache; @@ -99,7 +100,6 @@ struct btrfs_bio_ctrl { struct bio *bio; int mirror_num; enum btrfs_compression_type compress_type; - u32 len_to_stripe_boundary; u32 len_to_oe_boundary; btrfs_bio_end_io_t end_io_func; @@ -126,7 +126,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { struct bio *bio; struct bio_vec *bv; - struct btrfs_inode *inode; + struct inode *inode; int mirror_num; if (!bio_ctrl->bio) @@ -134,15 +134,13 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) bio = bio_ctrl->bio; bv = bio_first_bvec_all(bio); - inode = BTRFS_I(bv->bv_page->mapping->host); + inode = bv->bv_page->mapping->host; mirror_num = bio_ctrl->mirror_num; /* Caller should ensure the bio has at least some range added */ ASSERT(bio->bi_iter.bi_size); - btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset; - - if (!is_data_inode(&inode->vfs_inode)) { + if (!is_data_inode(inode)) { if 
(btrfs_op(bio) != BTRFS_MAP_WRITE) { /* * For metadata read, we should have the parent_check, @@ -153,14 +151,15 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) bio_ctrl->parent_check, sizeof(struct btrfs_tree_parent_check)); } - btrfs_submit_metadata_bio(inode, bio, mirror_num); - } else if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - btrfs_submit_data_write_bio(inode, bio, mirror_num); - } else { - btrfs_submit_data_read_bio(inode, bio, mirror_num, - bio_ctrl->compress_type); + bio->bi_opf |= REQ_META; } + if (btrfs_op(bio) == BTRFS_MAP_READ && + bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) + btrfs_submit_compressed_read(inode, bio, mirror_num); + else + btrfs_submit_bio(bio, mirror_num); + /* The bio is owned by the end_io handler now */ bio_ctrl->bio = NULL; } @@ -515,266 +514,6 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, start, end, page_ops, NULL); } -static int insert_failrec(struct btrfs_inode *inode, - struct io_failure_record *failrec) -{ - struct rb_node *exist; - - spin_lock(&inode->io_failure_lock); - exist = rb_simple_insert(&inode->io_failure_tree, failrec->bytenr, - &failrec->rb_node); - spin_unlock(&inode->io_failure_lock); - - return (exist == NULL) ? 0 : -EEXIST; -} - -static struct io_failure_record *get_failrec(struct btrfs_inode *inode, u64 start) -{ - struct rb_node *node; - struct io_failure_record *failrec = ERR_PTR(-ENOENT); - - spin_lock(&inode->io_failure_lock); - node = rb_simple_search(&inode->io_failure_tree, start); - if (node) - failrec = rb_entry(node, struct io_failure_record, rb_node); - spin_unlock(&inode->io_failure_lock); - return failrec; -} - -static void free_io_failure(struct btrfs_inode *inode, - struct io_failure_record *rec) -{ - spin_lock(&inode->io_failure_lock); - rb_erase(&rec->rb_node, &inode->io_failure_tree); - spin_unlock(&inode->io_failure_lock); - - kfree(rec); -} - -static int next_mirror(const struct io_failure_record *failrec, int cur_mirror) -{ - if (cur_mirror == failrec->num_copies) - return cur_mirror + 1 - failrec->num_copies; - return cur_mirror + 1; -} - -static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror) -{ - if (cur_mirror == 1) - return failrec->num_copies; - return cur_mirror - 1; -} - -/* - * each time an IO finishes, we do a fast check in the IO failure tree - * to see if we need to process or clean up an io_failure_record - */ -int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, - struct page *page, unsigned int pg_offset) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct extent_io_tree *io_tree = &inode->io_tree; - u64 ino = btrfs_ino(inode); - u64 locked_start, locked_end; - struct io_failure_record *failrec; - int mirror; - int ret; - - failrec = get_failrec(inode, start); - if (IS_ERR(failrec)) - return 0; - - BUG_ON(!failrec->this_mirror); - - if (sb_rdonly(fs_info->sb)) - goto out; - - ret = find_first_extent_bit(io_tree, failrec->bytenr, &locked_start, - &locked_end, EXTENT_LOCKED, NULL); - if (ret || locked_start > failrec->bytenr || - locked_end < failrec->bytenr + failrec->len - 1) - goto out; - - mirror = failrec->this_mirror; - do { - mirror = prev_mirror(failrec, mirror); - btrfs_repair_io_failure(fs_info, ino, start, failrec->len, - failrec->logical, page, pg_offset, mirror); - } while (mirror != failrec->failed_mirror); - -out: - free_io_failure(inode, failrec); - return 0; -} - -/* - * Can be called when - * - hold extent lock - * - under ordered extent - * - the inode is freeing - */ -void 
btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) -{ - struct io_failure_record *failrec; - struct rb_node *node, *next; - - if (RB_EMPTY_ROOT(&inode->io_failure_tree)) - return; - - spin_lock(&inode->io_failure_lock); - node = rb_simple_search_first(&inode->io_failure_tree, start); - while (node) { - failrec = rb_entry(node, struct io_failure_record, rb_node); - if (failrec->bytenr > end) - break; - - next = rb_next(node); - rb_erase(&failrec->rb_node, &inode->io_failure_tree); - kfree(failrec); - - node = next; - } - spin_unlock(&inode->io_failure_lock); -} - -static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode, - struct btrfs_bio *bbio, - unsigned int bio_offset) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - u64 start = bbio->file_offset + bio_offset; - struct io_failure_record *failrec; - const u32 sectorsize = fs_info->sectorsize; - int ret; - - failrec = get_failrec(BTRFS_I(inode), start); - if (!IS_ERR(failrec)) { - btrfs_debug(fs_info, - "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu", - failrec->logical, failrec->bytenr, failrec->len); - /* - * when data can be on disk more than twice, add to failrec here - * (e.g. with a list for failed_mirror) to make - * clean_io_failure() clean all those errors at once. - */ - ASSERT(failrec->this_mirror == bbio->mirror_num); - ASSERT(failrec->len == fs_info->sectorsize); - return failrec; - } - - failrec = kzalloc(sizeof(*failrec), GFP_NOFS); - if (!failrec) - return ERR_PTR(-ENOMEM); - - RB_CLEAR_NODE(&failrec->rb_node); - failrec->bytenr = start; - failrec->len = sectorsize; - failrec->failed_mirror = bbio->mirror_num; - failrec->this_mirror = bbio->mirror_num; - failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset; - - btrfs_debug(fs_info, - "new io failure record logical %llu start %llu", - failrec->logical, start); - - failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize); - if (failrec->num_copies == 1) { - /* - * We only have a single copy of the data, so don't bother with - * all the retry and error correction code that follows. No - * matter what the error is, it is very likely to persist. 
- */ - btrfs_debug(fs_info, - "cannot repair logical %llu num_copies %d", - failrec->logical, failrec->num_copies); - kfree(failrec); - return ERR_PTR(-EIO); - } - - /* Set the bits in the private failure tree */ - ret = insert_failrec(BTRFS_I(inode), failrec); - if (ret) { - kfree(failrec); - return ERR_PTR(ret); - } - - return failrec; -} - -int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio, - u32 bio_offset, struct page *page, unsigned int pgoff, - bool submit_buffered) -{ - u64 start = failed_bbio->file_offset + bio_offset; - struct io_failure_record *failrec; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct bio *failed_bio = &failed_bbio->bio; - const int icsum = bio_offset >> fs_info->sectorsize_bits; - struct bio *repair_bio; - struct btrfs_bio *repair_bbio; - - btrfs_debug(fs_info, - "repair read error: read error at %llu", start); - - BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); - - failrec = btrfs_get_io_failure_record(&inode->vfs_inode, failed_bbio, bio_offset); - if (IS_ERR(failrec)) - return PTR_ERR(failrec); - - /* - * There are two premises: - * a) deliver good data to the caller - * b) correct the bad sectors on disk - * - * Since we're only doing repair for one sector, we only need to get - * a good copy of the failed sector and if we succeed, we have setup - * everything for btrfs_repair_io_failure to do the rest for us. - */ - failrec->this_mirror = next_mirror(failrec, failrec->this_mirror); - if (failrec->this_mirror == failrec->failed_mirror) { - btrfs_debug(fs_info, - "failed to repair num_copies %d this_mirror %d failed_mirror %d", - failrec->num_copies, failrec->this_mirror, failrec->failed_mirror); - free_io_failure(inode, failrec); - return -EIO; - } - - repair_bio = btrfs_bio_alloc(1, REQ_OP_READ, failed_bbio->end_io, - failed_bbio->private); - repair_bbio = btrfs_bio(repair_bio); - repair_bbio->file_offset = start; - repair_bio->bi_iter.bi_sector = failrec->logical >> 9; - - if (failed_bbio->csum) { - const u32 csum_size = fs_info->csum_size; - - repair_bbio->csum = repair_bbio->csum_inline; - memcpy(repair_bbio->csum, - failed_bbio->csum + csum_size * icsum, csum_size); - } - - bio_add_page(repair_bio, page, failrec->len, pgoff); - repair_bbio->iter = repair_bio->bi_iter; - - btrfs_debug(fs_info, - "repair read error: submitting new read to mirror %d", - failrec->this_mirror); - - /* - * At this point we have a bio, so any errors from bio submission will - * be handled by the endio on the repair_bio, so we can't return an - * error here. 
- */ - if (submit_buffered) - btrfs_submit_data_read_bio(inode, repair_bio, - failrec->this_mirror, 0); - else - btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror); - - return BLK_STS_OK; -} - static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); @@ -803,79 +542,6 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) btrfs_subpage_end_reader(fs_info, page, start, len); } -static void end_sector_io(struct page *page, u64 offset, bool uptodate) -{ - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - const u32 sectorsize = inode->root->fs_info->sectorsize; - - end_page_read(page, uptodate, offset, sectorsize); - unlock_extent(&inode->io_tree, offset, offset + sectorsize - 1, NULL); -} - -static void submit_data_read_repair(struct inode *inode, - struct btrfs_bio *failed_bbio, - u32 bio_offset, const struct bio_vec *bvec, - unsigned int error_bitmap) -{ - const unsigned int pgoff = bvec->bv_offset; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct page *page = bvec->bv_page; - const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset; - const u64 end = start + bvec->bv_len - 1; - const u32 sectorsize = fs_info->sectorsize; - const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits; - int i; - - BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE); - - /* This repair is only for data */ - ASSERT(is_data_inode(inode)); - - /* We're here because we had some read errors or csum mismatch */ - ASSERT(error_bitmap); - - /* - * We only get called on buffered IO, thus page must be mapped and bio - * must not be cloned. - */ - ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED)); - - /* Iterate through all the sectors in the range */ - for (i = 0; i < nr_bits; i++) { - const unsigned int offset = i * sectorsize; - bool uptodate = false; - int ret; - - if (!(error_bitmap & (1U << i))) { - /* - * This sector has no error, just end the page read - * and unlock the range. - */ - uptodate = true; - goto next; - } - - ret = btrfs_repair_one_sector(BTRFS_I(inode), failed_bbio, - bio_offset + offset, page, pgoff + offset, - true); - if (!ret) { - /* - * We have submitted the read repair, the page release - * will be handled by the endio function of the - * submitted repair bio. - * Thus we don't need to do any thing here. - */ - continue; - } - /* - * Continue on failed repair, otherwise the remaining sectors - * will not be properly unlocked. 
- */ -next: - end_sector_io(page, start + offset, uptodate); - } -} - /* lots and lots of room for performance fixes in the end_bio funcs */ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) @@ -919,7 +585,6 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) u64 start; u64 end; struct bvec_iter_all iter_all; - bool first_bvec = true; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { @@ -941,11 +606,6 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) start = page_offset(page) + bvec->bv_offset; end = start + bvec->bv_len - 1; - if (first_bvec) { - btrfs_record_physical_zoned(inode, start, bio); - first_bvec = false; - } - end_extent_writepage(page, error, start, end); btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len); @@ -1093,8 +753,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u32 sectorsize = fs_info->sectorsize; - unsigned int error_bitmap = (unsigned int)-1; - bool repair = false; u64 start; u64 end; u32 len; @@ -1126,25 +784,14 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) len = bvec->bv_len; mirror = bbio->mirror_num; - if (likely(uptodate)) { - if (is_data_inode(inode)) { - error_bitmap = btrfs_verify_data_csum(bbio, - bio_offset, page, start, end); - if (error_bitmap) - uptodate = false; - } else { - if (btrfs_validate_metadata_buffer(bbio, - page, start, end, mirror)) - uptodate = false; - } - } + if (uptodate && !is_data_inode(inode) && + btrfs_validate_metadata_buffer(bbio, page, start, end, mirror)) + uptodate = false; if (likely(uptodate)) { loff_t i_size = i_size_read(inode); pgoff_t end_index = i_size >> PAGE_SHIFT; - btrfs_clean_io_failure(BTRFS_I(inode), start, page, 0); - /* * Zero out the remaining part if this range straddles * i_size. @@ -1161,19 +808,7 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) zero_user_segment(page, zero_start, offset_in_page(end) + 1); } - } else if (is_data_inode(inode)) { - /* - * Only try to repair bios that actually made it to a - * device. If the bio failed to be submitted mirror - * is 0 and we need to fail it without retrying. - * - * This also includes the high level bios for compressed - * extents - these never make it to a device and repair - * is already handled on the lower compressed bio. - */ - if (mirror > 0) - repair = true; - } else { + } else if (!is_data_inode(inode)) { struct extent_buffer *eb; eb = find_extent_buffer_readpage(fs_info, page, start); @@ -1182,19 +817,10 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) atomic_dec(&eb->io_pages); } - if (repair) { - /* - * submit_data_read_repair() will handle all the good - * and bad sectors, we just continue to the next bvec. - */ - submit_data_read_repair(inode, bbio, bio_offset, bvec, - error_bitmap); - } else { - /* Update page status and unlock */ - end_page_read(page, uptodate, start, len); - endio_readpage_release_extent(&processed, BTRFS_I(inode), - start, end, PageUptodate(page)); - } + /* Update page status and unlock. 
*/ + end_page_read(page, uptodate, start, len); + endio_readpage_release_extent(&processed, BTRFS_I(inode), + start, end, PageUptodate(page)); ASSERT(bio_offset + len > bio_offset); bio_offset += len; @@ -1202,7 +828,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) } /* Release the last extent */ endio_readpage_release_extent(&processed, NULL, 0, 0, false); - btrfs_bio_free_csum(bbio); bio_put(bio); } @@ -1270,11 +895,10 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, u32 real_size; const sector_t sector = disk_bytenr >> SECTOR_SHIFT; bool contig = false; - int ret; ASSERT(bio); /* The limit should be calculated when bio_ctrl->bio is allocated */ - ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary); + ASSERT(bio_ctrl->len_to_oe_boundary); if (bio_ctrl->compress_type != compress_type) return 0; @@ -1310,9 +934,7 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, if (!contig) return 0; - real_size = min(bio_ctrl->len_to_oe_boundary, - bio_ctrl->len_to_stripe_boundary) - bio_size; - real_size = min(real_size, size); + real_size = min(bio_ctrl->len_to_oe_boundary - bio_size, size); /* * If real_size is 0, never call bio_add_*_page(), as even size is 0, @@ -1321,82 +943,45 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, if (real_size == 0) return 0; - if (bio_op(bio) == REQ_OP_ZONE_APPEND) - ret = bio_add_zone_append_page(bio, page, real_size, pg_offset); - else - ret = bio_add_page(bio, page, real_size, pg_offset); - - return ret; + return bio_add_page(bio, page, real_size, pg_offset); } -static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, - struct btrfs_inode *inode, u64 file_offset) +static void calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, + struct btrfs_inode *inode, u64 file_offset) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_io_geometry geom; struct btrfs_ordered_extent *ordered; - struct extent_map *em; - u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT); - int ret; /* - * Pages for compressed extent are never submitted to disk directly, - * thus it has no real boundary, just set them to U32_MAX. - * - * The split happens for real compressed bio, which happens in - * btrfs_submit_compressed_read/write(). + * Limit the extent to the ordered boundary for Zone Append. + * Compressed bios aren't submitted directly, so it doesn't apply to + * them. 
*/ - if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { - bio_ctrl->len_to_oe_boundary = U32_MAX; - bio_ctrl->len_to_stripe_boundary = U32_MAX; - return 0; - } - em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize); - if (IS_ERR(em)) - return PTR_ERR(em); - ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio), - logical, &geom); - free_extent_map(em); - if (ret < 0) { - return ret; - } - if (geom.len > U32_MAX) - bio_ctrl->len_to_stripe_boundary = U32_MAX; - else - bio_ctrl->len_to_stripe_boundary = (u32)geom.len; - - if (bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) { - bio_ctrl->len_to_oe_boundary = U32_MAX; - return 0; - } - - /* Ordered extent not yet created, so we're good */ - ordered = btrfs_lookup_ordered_extent(inode, file_offset); - if (!ordered) { - bio_ctrl->len_to_oe_boundary = U32_MAX; - return 0; + if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE && + btrfs_use_zone_append(btrfs_bio(bio_ctrl->bio))) { + ordered = btrfs_lookup_ordered_extent(inode, file_offset); + if (ordered) { + bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, + ordered->file_offset + + ordered->disk_num_bytes - file_offset); + btrfs_put_ordered_extent(ordered); + return; + } } - bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, - ordered->disk_bytenr + ordered->disk_num_bytes - logical); - btrfs_put_ordered_extent(ordered); - return 0; + bio_ctrl->len_to_oe_boundary = U32_MAX; } -static int alloc_new_bio(struct btrfs_inode *inode, - struct btrfs_bio_ctrl *bio_ctrl, - struct writeback_control *wbc, - blk_opf_t opf, - u64 disk_bytenr, u32 offset, u64 file_offset, - enum btrfs_compression_type compress_type) +static void alloc_new_bio(struct btrfs_inode *inode, + struct btrfs_bio_ctrl *bio_ctrl, + struct writeback_control *wbc, blk_opf_t opf, + u64 disk_bytenr, u32 offset, u64 file_offset, + enum btrfs_compression_type compress_type) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio; - int ret; - - ASSERT(bio_ctrl->end_io_func); - bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, bio_ctrl->end_io_func, NULL); + bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, inode, bio_ctrl->end_io_func, + NULL); /* * For compressed page range, its disk_bytenr is always @disk_bytenr * passed in, no matter if we have added any range into previous bio. @@ -1405,48 +990,21 @@ static int alloc_new_bio(struct btrfs_inode *inode, bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; else bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; + btrfs_bio(bio)->file_offset = file_offset; bio_ctrl->bio = bio; bio_ctrl->compress_type = compress_type; - ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); - if (ret < 0) - goto error; + calc_bio_boundaries(bio_ctrl, inode, file_offset); if (wbc) { /* - * For Zone append we need the correct block_device that we are - * going to write to set in the bio to be able to respect the - * hardware limitation. Look it up here: + * Pick the last added device to support cgroup writeback. For + * multi-device file systems this means blk-cgroup policies have + * to always be set on the last added/replaced device. + * This is a bit odd but has been like that for a long time. */ - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct btrfs_device *dev; - - dev = btrfs_zoned_get_device(fs_info, disk_bytenr, - fs_info->sectorsize); - if (IS_ERR(dev)) { - ret = PTR_ERR(dev); - goto error; - } - - bio_set_dev(bio, dev->bdev); - } else { - /* - * Otherwise pick the last added device to support - * cgroup writeback. 
For multi-device file systems this - * means blk-cgroup policies have to always be set on the - * last added/replaced device. This is a bit odd but has - * been like that for a long time. - */ - bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); - } + bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); wbc_init_bio(wbc, bio); - } else { - ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND); } - return 0; -error: - bio_ctrl->bio = NULL; - btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); - return ret; } /* @@ -1472,7 +1030,6 @@ static int submit_extent_page(blk_opf_t opf, enum btrfs_compression_type compress_type, bool force_bio_submit) { - int ret = 0; struct btrfs_inode *inode = BTRFS_I(page->mapping->host); unsigned int cur = pg_offset; @@ -1492,12 +1049,9 @@ static int submit_extent_page(blk_opf_t opf, /* Allocate new bio if needed */ if (!bio_ctrl->bio) { - ret = alloc_new_bio(inode, bio_ctrl, wbc, opf, - disk_bytenr, offset, - page_offset(page) + cur, - compress_type); - if (ret < 0) - return ret; + alloc_new_bio(inode, bio_ctrl, wbc, opf, disk_bytenr, + offset, page_offset(page) + cur, + compress_type); } /* * We must go through btrfs_bio_add_page() to ensure each @@ -2054,10 +1608,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * find_next_dirty_byte() are all exclusive */ iosize = min(min(em_end, end + 1), dirty_range_end) - cur; - - if (btrfs_use_zone_append(inode, em->block_start)) - op = REQ_OP_ZONE_APPEND; - free_extent_map(em); em = NULL; @@ -2361,13 +1911,6 @@ static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) mapping_set_error(page->mapping, -EIO); /* - * If we error out, we should add back the dirty_metadata_bytes - * to make it consistent. - */ - percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, - eb->len, fs_info->dirty_metadata_batch); - - /* * If writeback for a btree extent that doesn't belong to a log tree * failed, increment the counter transaction->eb_write_errors. 
* We do this because while the transaction is running and before it's @@ -2865,14 +2408,14 @@ int btree_write_cache_pages(struct address_space *mapping, int ret = 0; int done = 0; int nr_to_write_done = 0; - struct pagevec pvec; - int nr_pages; + struct folio_batch fbatch; + unsigned int nr_folios; pgoff_t index; pgoff_t end; /* Inclusive */ int scanned = 0; xa_mark_t tag; - pagevec_init(&pvec); + folio_batch_init(&fbatch); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; @@ -2895,14 +2438,15 @@ retry: if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag))) { + (nr_folios = filemap_get_folios_tag(mapping, &index, end, + tag, &fbatch))) { unsigned i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; - ret = submit_eb_page(page, wbc, &bio_ctrl, &eb_context); + ret = submit_eb_page(&folio->page, wbc, &bio_ctrl, + &eb_context); if (ret == 0) continue; if (ret < 0) { @@ -2917,7 +2461,7 @@ retry: */ nr_to_write_done = wbc->nr_to_write <= 0; } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } if (!scanned && !done) { @@ -2992,8 +2536,8 @@ static int extent_write_cache_pages(struct address_space *mapping, int ret = 0; int done = 0; int nr_to_write_done = 0; - struct pagevec pvec; - int nr_pages; + struct folio_batch fbatch; + unsigned int nr_folios; pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; @@ -3013,7 +2557,7 @@ static int extent_write_cache_pages(struct address_space *mapping, if (!igrab(inode)) return 0; - pagevec_init(&pvec); + folio_batch_init(&fbatch); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; @@ -3051,14 +2595,14 @@ retry: tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, - &index, end, tag))) { + (nr_folios = filemap_get_folios_tag(mapping, &index, + end, tag, &fbatch))) { unsigned i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; - done_index = page->index + 1; + done_index = folio->index + folio_nr_pages(folio); /* * At this point we hold neither the i_pages lock nor * the page lock: the page may be truncated or @@ -3066,29 +2610,29 @@ retry: * or even swizzled back from swapper_space to * tmpfs file mapping */ - if (!trylock_page(page)) { + if (!folio_trylock(folio)) { submit_write_bio(bio_ctrl, 0); - lock_page(page); + folio_lock(folio); } - if (unlikely(page->mapping != mapping)) { - unlock_page(page); + if (unlikely(folio->mapping != mapping)) { + folio_unlock(folio); continue; } if (wbc->sync_mode != WB_SYNC_NONE) { - if (PageWriteback(page)) + if (folio_test_writeback(folio)) submit_write_bio(bio_ctrl, 0); - wait_on_page_writeback(page); + folio_wait_writeback(folio); } - if (PageWriteback(page) || - !clear_page_dirty_for_io(page)) { - unlock_page(page); + if (folio_test_writeback(folio) || + !folio_clear_dirty_for_io(folio)) { + folio_unlock(folio); continue; } - ret = __extent_writepage(page, wbc, bio_ctrl); + ret = __extent_writepage(&folio->page, wbc, bio_ctrl); if (ret < 0) { done = 1; break; @@ -3101,7 +2645,7 @@ retry: */ nr_to_write_done = 
wbc->nr_to_write <= 0; } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } if (!scanned && !done) { @@ -3826,6 +3370,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, lockend = round_up(start + len, inode->root->fs_info->sectorsize); prev_extent_end = lockstart; + btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); @@ -4019,6 +3564,7 @@ check_eof_delalloc: out_unlock: unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); out: free_extent_state(delalloc_cached_state); btrfs_free_backref_share_ctx(backref_ctx); @@ -4722,12 +4268,25 @@ static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb) WARN_ON(atomic_read(&eb->refs) == 0); } -void clear_extent_buffer_dirty(const struct extent_buffer *eb) +void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, + struct extent_buffer *eb) { + struct btrfs_fs_info *fs_info = eb->fs_info; int i; int num_pages; struct page *page; + btrfs_assert_tree_write_locked(eb); + + if (trans && btrfs_header_generation(eb) != trans->transid) + return; + + if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) + return; + + percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, + fs_info->dirty_metadata_batch); + if (eb->fs_info->nodesize < PAGE_SIZE) return clear_subpage_extent_buffer_dirty(eb); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a2c82448b2e0..4341ad978fb8 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -11,6 +11,8 @@ #include "ulist.h" #include "misc.h" +struct btrfs_trans_handle; + enum { EXTENT_BUFFER_UPTODATE, EXTENT_BUFFER_DIRTY, @@ -60,11 +62,9 @@ enum { #define BITMAP_LAST_BYTE_MASK(nbits) \ (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) -struct btrfs_bio; struct btrfs_root; struct btrfs_inode; struct btrfs_fs_info; -struct io_failure_record; struct extent_io_tree; struct btrfs_tree_parent_check; @@ -262,7 +262,6 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star void extent_buffer_bitmap_clear(const struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len); -void clear_extent_buffer_dirty(const struct extent_buffer *eb); bool set_extent_buffer_dirty(struct extent_buffer *eb); void set_extent_buffer_uptodate(struct extent_buffer *eb); void clear_extent_buffer_uptodate(struct extent_buffer *eb); @@ -274,40 +273,13 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, u32 bits_to_clear, unsigned long page_ops); int extent_invalidate_folio(struct extent_io_tree *tree, struct folio *folio, size_t offset); +void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, + struct extent_buffer *buf); int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); -/* - * When IO fails, either with EIO or csum verification fails, we - * try other mirrors that might have a good copy of the data. This - * io_failure_record is used to record state as we go through all the - * mirrors. If another mirror has good data, the sector is set up to date - * and things continue. If a good mirror can't be found, the original - * bio end_io callback is called to indicate things have failed. 
- */ -struct io_failure_record { - /* Use rb_simple_node for search/insert */ - struct { - struct rb_node rb_node; - u64 bytenr; - }; - struct page *page; - u64 len; - u64 logical; - int this_mirror; - int failed_mirror; - int num_copies; -}; - -int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio, - u32 bio_offset, struct page *page, unsigned int pgoff, - bool submit_buffered); -void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end); -int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, - struct page *page, unsigned int pg_offset); - #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, struct page *locked_page, u64 *start, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 5de73466b2ca..41c77a100853 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -380,32 +380,25 @@ static int search_file_offset_in_bio(struct bio *bio, struct inode *inode, /* * Lookup the checksum for the read bio in csum tree. * - * @inode: inode that the bio is for. - * @bio: bio to look up. - * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return - * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If - * NULL, the checksum buffer is allocated and returned in - * btrfs_bio(bio)->csum instead. - * * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. */ -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst) +blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_bio *bbio = NULL; + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; + struct bio *bio = &bbio->bio; struct btrfs_path *path; const u32 sectorsize = fs_info->sectorsize; const u32 csum_size = fs_info->csum_size; u32 orig_len = bio->bi_iter.bi_size; u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 cur_disk_bytenr; - u8 *csum; const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; int count = 0; blk_status_t ret = BLK_STS_OK; - if ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || + if ((inode->flags & BTRFS_INODE_NODATASUM) || test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) return BLK_STS_OK; @@ -426,21 +419,14 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst if (!path) return BLK_STS_RESOURCE; - if (!dst) { - bbio = btrfs_bio(bio); - - if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { - bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); - if (!bbio->csum) { - btrfs_free_path(path); - return BLK_STS_RESOURCE; - } - } else { - bbio->csum = bbio->csum_inline; + if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { + bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); + if (!bbio->csum) { + btrfs_free_path(path); + return BLK_STS_RESOURCE; } - csum = bbio->csum; } else { - csum = dst; + bbio->csum = bbio->csum_inline; } /* @@ -456,7 +442,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst * read from the commit root and sidestep a nasty deadlock * between reading the free space cache and updating the csum tree. 
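(A toy userspace model, with made-up names, of the inline-versus-allocated checksum buffer scheme in the btrfs_lookup_bio_sums() hunk above: small bios keep their csums in an array embedded in the bio wrapper, larger ones fall back to the heap, and the free path tells the cases apart by comparing the pointer against the embedded array.)

#include <stdlib.h>

#define INLINE_CSUM_SIZE 64

struct toy_bbio {
        unsigned char *csum;
        unsigned char csum_inline[INLINE_CSUM_SIZE];
};

static int toy_alloc_csums(struct toy_bbio *b, size_t nblocks, size_t csum_size)
{
        if (nblocks * csum_size > INLINE_CSUM_SIZE) {
                b->csum = malloc(nblocks * csum_size);
                if (!b->csum)
                        return -1;      /* BLK_STS_RESOURCE in the kernel */
        } else {
                b->csum = b->csum_inline;
        }
        return 0;
}

static void toy_free_csums(struct toy_bbio *b)
{
        if (b->csum != b->csum_inline)
                free(b->csum);
        b->csum = NULL;
}

The embedded array avoids an allocation in the common case of small reads, at the cost of the pointer comparison on every free.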
*/ - if (btrfs_is_free_space_inode(BTRFS_I(inode))) { + if (btrfs_is_free_space_inode(inode)) { path->search_commit_root = 1; path->skip_locking = 1; } @@ -479,14 +465,15 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst ASSERT(cur_disk_bytenr - orig_disk_bytenr < UINT_MAX); sector_offset = (cur_disk_bytenr - orig_disk_bytenr) >> fs_info->sectorsize_bits; - csum_dst = csum + sector_offset * csum_size; + csum_dst = bbio->csum + sector_offset * csum_size; count = search_csum_tree(fs_info, path, cur_disk_bytenr, search_len, csum_dst); if (count < 0) { ret = errno_to_blk_status(count); - if (bbio) - btrfs_bio_free_csum(bbio); + if (bbio->csum != bbio->csum_inline) + kfree(bbio->csum); + bbio->csum = NULL; break; } @@ -504,12 +491,13 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst memset(csum_dst, 0, csum_size); count = 1; - if (BTRFS_I(inode)->root->root_key.objectid == + if (inode->root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { u64 file_offset; int ret; - ret = search_file_offset_in_bio(bio, inode, + ret = search_file_offset_in_bio(bio, + &inode->vfs_inode, cur_disk_bytenr, &file_offset); if (ret) set_extent_bits(io_tree, file_offset, @@ -784,23 +772,16 @@ fail: /* * Calculate checksums of the data contained inside a bio. - * - * @inode: Owner of the data inside the bio - * @bio: Contains the data to be checksummed - * @offset: If (u64)-1, @bio may contain discontiguous bio vecs, so the - * file offsets are determined from the page offsets in the bio. - * Otherwise, this is the starting file offset of the bio vecs in - * @bio, which must be contiguous. - * @one_ordered: If true, @bio only refers to one ordered extent. */ -blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - u64 offset, bool one_ordered) +blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) { + struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + struct bio *bio = &bbio->bio; + u64 offset = bbio->file_offset; struct btrfs_ordered_sum *sums; struct btrfs_ordered_extent *ordered = NULL; - const bool use_page_offsets = (offset == (u64)-1); char *data; struct bvec_iter iter; struct bio_vec bvec; @@ -828,9 +809,6 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, shash->tfm = fs_info->csum_shash; bio_for_each_segment(bvec, bio, iter) { - if (use_page_offsets) - offset = page_offset(bvec.bv_page) + bvec.bv_offset; - if (!ordered) { ordered = btrfs_lookup_ordered_extent(inode, offset); /* @@ -852,7 +830,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - 1); for (i = 0; i < blockcount; i++) { - if (!one_ordered && + if (!(bio->bi_opf & REQ_BTRFS_ONE_ORDERED) && !in_range(offset, ordered->file_offset, ordered->num_bytes)) { unsigned long bytes_left; diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 031225668434..cd7f2ae515c0 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -38,7 +38,7 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); +blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio); int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, u64 num_bytes); @@ -49,8 +49,10 @@ int 
btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); -blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - u64 offset, bool one_ordered); +blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio); +int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit, + bool nowait); int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit, bool nowait); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index af046d22300e..5cc5a1faaef5 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1017,7 +1017,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, unlock_page(pages[i]); put_page(pages[i]); } - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); return -EAGAIN; } diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index c667e878ef1a..4d155a48ec59 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1283,7 +1283,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) list_del(&free_space_root->dirty_list); btrfs_tree_lock(free_space_root->node); - btrfs_clean_tree_block(free_space_root->node); + btrfs_clear_buffer_dirty(trans, free_space_root->node); btrfs_tree_unlock(free_space_root->node); btrfs_free_tree_block(trans, btrfs_root_id(free_space_root), free_space_root->node, 0, 1); diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index 5553e1f8afe8..31c1648bc0b4 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -24,6 +24,7 @@ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, name, flag); } spin_unlock(&fs_info->super_lock); + set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); } } @@ -46,6 +47,7 @@ void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, name, flag); } spin_unlock(&fs_info->super_lock); + set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); } } @@ -68,6 +70,7 @@ void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, name, flag); } spin_unlock(&fs_info->super_lock); + set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); } } @@ -90,5 +93,6 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, name, flag); } spin_unlock(&fs_info->super_lock); + set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); } } diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 37b86acfcbcf..4c477eae6891 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -3,6 +3,7 @@ #ifndef BTRFS_FS_H #define BTRFS_FS_H +#include <linux/blkdev.h> #include <linux/fs.h> #include <linux/btrfs_tree.h> #include <linux/sizes.h> @@ -125,6 +126,12 @@ enum { */ BTRFS_FS_NO_OVERCOMMIT, + /* + * Indicate that some features have changed; this is mostly for the + * cleaner thread to update the sysfs interface.
+ */ + BTRFS_FS_FEATURE_CHANGED, + #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, @@ -742,8 +749,10 @@ struct btrfs_fs_info { */ u64 zone_size; - /* Max size to emit ZONE_APPEND write command */ + /* Constraints for ZONE_APPEND commands: */ + struct queue_limits limits; u64 max_zone_append_size; + struct mutex zoned_meta_io_lock; spinlock_t treelog_bg_lock; u64 treelog_bg; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 98a800b8bd43..6c18dc9a1831 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -84,27 +84,12 @@ struct btrfs_dio_data { }; struct btrfs_dio_private { - struct btrfs_inode *inode; - - /* - * Since DIO can use anonymous page, we cannot use page_offset() to - * grab the file offset, thus need a dedicated member for file offset. - */ + /* Range of I/O */ u64 file_offset; - /* Used for bio::bi_size */ u32 bytes; - /* - * References to this structure. There is one reference per in-flight - * bio plus one while we're still setting up. - */ - refcount_t refs; - - /* Array of checksums */ - u8 *csums; - /* This must be last */ - struct bio bio; + struct btrfs_bio bbio; }; static struct bio_set btrfs_dio_bioset; @@ -228,7 +213,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; - u64 page_start, page_end; + u64 page_start = 0, page_end = 0; struct page *page; if (locked_page) { @@ -2536,19 +2521,6 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, } /* - * in order to insert checksums into the metadata in large chunks, - * we wait until bio submission time. All the pages in the bio are - * checksummed and sums are attached onto the ordered extent record. - * - * At IO completion time the cums attached on the ordered extent record - * are inserted into the btree - */ -blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio) -{ - return btrfs_csum_one_bio(inode, bio, (u64)-1, false); -} - -/* * Split an extent_map at [start, start + len] * * This function is intended to be used only for extract_ordered_extent(). 
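(The pre/post arithmetic behind btrfs_extract_ordered_extent() below, reduced to plain integers; split_bounds() is a made-up name for illustration only.)

#include <stdint.h>
#include <assert.h>

/* An ordered extent [oe_start, oe_start + oe_len) containing the bio
 * [start, start + len) is cut into a "pre" piece before the bio and a
 * "post" piece after it; the middle piece matches the bio exactly. */
static void split_bounds(uint64_t oe_start, uint64_t oe_len,
                         uint64_t start, uint64_t len,
                         uint64_t *pre, uint64_t *post)
{
        uint64_t oe_end = oe_start + oe_len;

        assert(start >= oe_start && start + len <= oe_end);
        *pre = start - oe_start;
        *post = oe_end - (start + len);
}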
@@ -2663,19 +2635,19 @@ out: return ret; } -static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, - struct bio *bio, loff_t file_offset) +blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio) { + u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; + u64 len = bbio->bio.bi_iter.bi_size; + struct btrfs_inode *inode = bbio->inode; struct btrfs_ordered_extent *ordered; - u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 file_len; - u64 len = bio->bi_iter.bi_size; u64 end = start + len; u64 ordered_end; u64 pre, post; int ret = 0; - ordered = btrfs_lookup_ordered_extent(inode, file_offset); + ordered = btrfs_lookup_ordered_extent(inode, bbio->file_offset); if (WARN_ON_ONCE(!ordered)) return BLK_STS_IOERR; @@ -2715,7 +2687,7 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, ret = btrfs_split_ordered_extent(ordered, pre, post); if (ret) goto out; - ret = split_zoned_em(inode, file_offset, file_len, pre, post); + ret = split_zoned_em(inode, bbio->file_offset, file_len, pre, post); out: btrfs_put_ordered_extent(ordered); @@ -2723,75 +2695,6 @@ out: return errno_to_blk_status(ret); } -void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - blk_status_t ret; - - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - ret = extract_ordered_extent(inode, bio, - page_offset(bio_first_bvec_all(bio)->bv_page)); - if (ret) { - btrfs_bio_end_io(btrfs_bio(bio), ret); - return; - } - } - - /* - * If we need to checksum, and the I/O is not issued by fsync and - * friends, that is ->sync_writers != 0, defer the submission to a - * workqueue to parallelize it. - * - * Csum items for reloc roots have already been cloned at this point, - * so they are handled as part of the no-checksum case. - */ - if (!(inode->flags & BTRFS_INODE_NODATASUM) && - !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && - !btrfs_is_data_reloc_root(inode->root)) { - if (!atomic_read(&inode->sync_writers) && - btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_DATA)) - return; - - ret = btrfs_csum_one_bio(inode, bio, (u64)-1, false); - if (ret) { - btrfs_bio_end_io(btrfs_bio(bio), ret); - return; - } - } - btrfs_submit_bio(fs_info, bio, mirror_num); -} - -void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, - int mirror_num, enum btrfs_compression_type compress_type) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - blk_status_t ret; - - if (compress_type != BTRFS_COMPRESS_NONE) { - /* - * btrfs_submit_compressed_read will handle completing the bio - * if there were any errors, so just return here. - */ - btrfs_submit_compressed_read(&inode->vfs_inode, bio, mirror_num); - return; - } - - /* Save the original iter for read repair */ - btrfs_bio(bio)->iter = bio->bi_iter; - - /* - * Lookup bio sums does extra checks around whether we need to csum or - * not, which is why we ignore skip_sum here. - */ - ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); - if (ret) { - btrfs_bio_end_io(btrfs_bio(bio), ret); - return; - } - - btrfs_submit_bio(fs_info, bio, mirror_num); -} - /* * given a list of ordered sums record them in the inode. This happens * at IO completion time based on sums calculated at bio submission time. 
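(Modeled on the kernel's in_range() macro as used by the btrfs_csum_one_bio() hunk earlier in this diff; written out here as a plain function with an illustrative name. The csum loop uses this test to notice that a bio segment has walked past the current ordered extent and a new one must be looked up, unless REQ_BTRFS_ONE_ORDERED promises a single ordered extent for the whole bio.)

#include <stdint.h>
#include <stdbool.h>

/* True when idx falls inside the half-open range [start, start + len). */
static bool in_range_demo(uint64_t idx, uint64_t start, uint64_t len)
{
        return idx >= start && idx < start + len;
}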
@@ -2969,7 +2872,7 @@ again: unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); unlock_page(page); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } @@ -3259,15 +3162,13 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - /* A valid bdev implies a write on a sequential zone */ - if (ordered_extent->bdev) { + /* A valid ->physical implies a write on a sequential zone. */ + if (ordered_extent->physical != (u64)-1) { btrfs_rewrite_logical_zoned(ordered_extent); btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes); } - btrfs_free_io_failure_record(inode, start, end); - if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { truncated = true; logical_len = ordered_extent->truncated_len; @@ -3474,109 +3375,55 @@ static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 of } /* - * check_data_csum - verify checksum of one sector of uncompressed data - * @inode: inode - * @bbio: btrfs_bio which contains the csum + * Verify the checksum of a single data sector. + * + * @bbio: btrfs_bio which contains the csum + * @dev: device the sector is on + * @bio_offset: offset to the beginning of the bio (in bytes) - * @page: page where is the data to be verified - * @pgoff: offset inside the page + * @bv: bio_vec to check * - * The length of such check is always one sector size. + * Check if the checksum on a data block is valid. When a checksum mismatch is + * detected, report the error and fill the corrupted range with zero. * - * When csum mismatch is detected, we will also report the error and fill the - * corrupted range with zero. (Thus it needs the extra parameters) + * Return %true if the sector is ok or had no checksum to start with, else %false. */ -int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff) +bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, + u32 bio_offset, struct bio_vec *bv) { + struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - u32 len = fs_info->sectorsize; + u64 file_offset = bbio->file_offset + bio_offset; + u64 end = file_offset + bv->bv_len - 1; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; - ASSERT(pgoff + len <= PAGE_SIZE); + ASSERT(bv->bv_len == fs_info->sectorsize); - csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); + if (!bbio->csum) + return true; - if (btrfs_check_sector_csum(fs_info, page, pgoff, csum, csum_expected)) + if (btrfs_is_data_reloc_root(inode->root) && + test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, + 1, NULL)) { + /* Skip the range without csum for data reloc inode */ + clear_extent_bits(&inode->io_tree, file_offset, end, + EXTENT_NODATASUM); + return true; + } + + csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); + if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum, + csum_expected)) goto zeroit; - return 0; + return true; zeroit: - btrfs_print_data_csum_error(inode, bbio->file_offset + bio_offset, - csum, csum_expected, bbio->mirror_num); - if (bbio->device) - btrfs_dev_stat_inc_and_print(bbio->device, - BTRFS_DEV_STAT_CORRUPTION_ERRS); - memzero_page(page, pgoff, len); - return -EIO; -} - -/* - * When reads are done, we need to check csums to verify the data is correct. - * if there's a match, we allow the bio to finish.
If not, the code in - * extent_io.c will try to find good copies for us. - * - * @bio_offset: offset to the beginning of the bio (in bytes) - * @start: file offset of the range start - * @end: file offset of the range end (inclusive) - * - * Return a bitmap where bit set means a csum mismatch, and bit not set means - * csum match. - */ -unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, - u64 start, u64 end) -{ - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct extent_io_tree *io_tree = &inode->io_tree; - const u32 sectorsize = root->fs_info->sectorsize; - u32 pg_off; - unsigned int result = 0; - - /* - * This only happens for NODATASUM or compressed read. - * Normally this should be covered by above check for compressed read - * or the next check for NODATASUM. Just do a quicker exit here. - */ - if (bbio->csum == NULL) - return 0; - - if (inode->flags & BTRFS_INODE_NODATASUM) - return 0; - - if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))) - return 0; - - ASSERT(page_offset(page) <= start && - end <= page_offset(page) + PAGE_SIZE - 1); - for (pg_off = offset_in_page(start); - pg_off < offset_in_page(end); - pg_off += sectorsize, bio_offset += sectorsize) { - u64 file_offset = pg_off + page_offset(page); - int ret; - - if (btrfs_is_data_reloc_root(root) && - test_range_bit(io_tree, file_offset, - file_offset + sectorsize - 1, - EXTENT_NODATASUM, 1, NULL)) { - /* Skip the range without csum for data reloc inode */ - clear_extent_bits(io_tree, file_offset, - file_offset + sectorsize - 1, - EXTENT_NODATASUM); - continue; - } - ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off); - if (ret < 0) { - const int nr_bit = (pg_off - offset_in_page(start)) >> - root->fs_info->sectorsize_bits; - - result |= (1U << nr_bit); - } - } - return result; + btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected, + bbio->mirror_num); + if (dev) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); + memzero_bvec(bv); + return false; } /* @@ -4987,7 +4834,7 @@ again: unlock_extent(io_tree, block_start, block_end, &cached_state); unlock_page(page); put_page(page); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } @@ -5281,7 +5128,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) return ret; } -static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -5291,7 +5138,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr if (btrfs_root_readonly(root)) return -EROFS; - err = setattr_prepare(mnt_userns, dentry, attr); + err = setattr_prepare(idmap, dentry, attr); if (err) return err; @@ -5302,12 +5149,12 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr } if (attr->ia_valid) { - setattr_copy(mnt_userns, inode, attr); + setattr_copy(idmap, inode, attr); inode_inc_iversion(inode); err = btrfs_dirty_inode(BTRFS_I(inode)); if (!err && attr->ia_valid & ATTR_MODE) - err = posix_acl_chmod(mnt_userns, dentry, inode->i_mode); + err = posix_acl_chmod(idmap, dentry, inode->i_mode); } return err; @@ -5466,8 +5313,6 @@ void btrfs_evict_inode(struct inode *inode) if (is_bad_inode(inode)) goto 
no_delete; - btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1); - if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) goto no_delete; @@ -6724,7 +6569,7 @@ out_inode: return err; } -static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; @@ -6732,13 +6577,13 @@ static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, inode = new_inode(dir->i_sb); if (!inode) return -ENOMEM; - inode_init_owner(mnt_userns, inode, dir, mode); + inode_init_owner(idmap, inode, dir, mode); inode->i_op = &btrfs_special_inode_operations; init_special_inode(inode, inode->i_mode, rdev); return btrfs_create_common(dir, dentry, inode); } -static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int btrfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; @@ -6746,7 +6591,7 @@ static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir, inode = new_inode(dir->i_sb); if (!inode) return -ENOMEM; - inode_init_owner(mnt_userns, inode, dir, mode); + inode_init_owner(idmap, inode, dir, mode); inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; @@ -6837,7 +6682,7 @@ fail: return err; } -static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; @@ -6845,7 +6690,7 @@ static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, inode = new_inode(dir->i_sb); if (!inode) return -ENOMEM; - inode_init_owner(mnt_userns, inode, dir, S_IFDIR | mode); + inode_init_owner(idmap, inode, dir, S_IFDIR | mode); inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; return btrfs_create_common(dir, dentry, inode); @@ -7392,7 +7237,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, */ if (writing || test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); else ret = nowait ? -EAGAIN : -ENOTBLK; btrfs_put_ordered_extent(ordered); @@ -7833,10 +7678,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, iomap->offset = start; iomap->bdev = fs_info->fs_devices->latest_dev->bdev; iomap->length = len; - - if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start)) - iomap->flags |= IOMAP_F_ZONE_APPEND; - free_extent_map(em); return 0; @@ -7888,267 +7729,47 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, return ret; } -static void btrfs_dio_private_put(struct btrfs_dio_private *dip) -{ - /* - * This implies a barrier so that stores to dio_bio->bi_status before - * this and loads of dio_bio->bi_status after this are fully ordered. 
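(A userspace sketch of the ordering rule the removed comment describes, using C11 atomics in place of refcount_dec_and_test(): the decrement is a release, and the thread that observes zero performs an acquire, so stores made before any put are visible to whoever finally tears the object down.)

#include <stdatomic.h>
#include <stdbool.h>

static bool put_ref(atomic_int *refs)
{
        /* release: publish this thread's prior stores */
        if (atomic_fetch_sub_explicit(refs, 1, memory_order_release) == 1) {
                /* acquire: see every other putter's stores before teardown */
                atomic_thread_fence(memory_order_acquire);
                return true;    /* last reference: safe to free */
        }
        return false;
}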
- */ - if (!refcount_dec_and_test(&dip->refs)) - return; - - if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) { - btrfs_mark_ordered_io_finished(dip->inode, NULL, - dip->file_offset, dip->bytes, - !dip->bio.bi_status); - } else { - unlock_extent(&dip->inode->io_tree, - dip->file_offset, - dip->file_offset + dip->bytes - 1, NULL); - } - - kfree(dip->csums); - bio_endio(&dip->bio); -} - -void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) -{ - struct btrfs_dio_private *dip = btrfs_bio(bio)->private; - - BUG_ON(bio_op(bio) == REQ_OP_WRITE); - - refcount_inc(&dip->refs); - btrfs_submit_bio(inode->root->fs_info, bio, mirror_num); -} - -static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, - struct btrfs_bio *bbio, - const bool uptodate) -{ - struct inode *inode = &dip->inode->vfs_inode; - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); - blk_status_t err = BLK_STS_OK; - struct bvec_iter iter; - struct bio_vec bv; - u32 offset; - - btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { - u64 start = bbio->file_offset + offset; - - if (uptodate && - (!csum || !btrfs_check_data_csum(BTRFS_I(inode), bbio, offset, - bv.bv_page, bv.bv_offset))) { - btrfs_clean_io_failure(BTRFS_I(inode), start, - bv.bv_page, bv.bv_offset); - } else { - int ret; - - ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, - bv.bv_page, bv.bv_offset, false); - if (ret) - err = errno_to_blk_status(ret); - } - } - - return err; -} - -blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, - struct bio *bio, - u64 dio_file_offset) +static void btrfs_dio_end_io(struct btrfs_bio *bbio) { - return btrfs_csum_one_bio(inode, bio, dio_file_offset, false); -} - -static void btrfs_end_dio_bio(struct btrfs_bio *bbio) -{ - struct btrfs_dio_private *dip = bbio->private; + struct btrfs_dio_private *dip = + container_of(bbio, struct btrfs_dio_private, bbio); + struct btrfs_inode *inode = bbio->inode; struct bio *bio = &bbio->bio; - blk_status_t err = bio->bi_status; - - if (err) - btrfs_warn(dip->inode->root->fs_info, - "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", - btrfs_ino(dip->inode), bio_op(bio), - bio->bi_opf, bio->bi_iter.bi_sector, - bio->bi_iter.bi_size, err); - - if (bio_op(bio) == REQ_OP_READ) - err = btrfs_check_read_dio_bio(dip, bbio, !err); - - if (err) - dip->bio.bi_status = err; - - btrfs_record_physical_zoned(&dip->inode->vfs_inode, bbio->file_offset, bio); - - bio_put(bio); - btrfs_dio_private_put(dip); -} -static void btrfs_submit_dio_bio(struct bio *bio, struct btrfs_inode *inode, - u64 file_offset, int async_submit) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_dio_private *dip = btrfs_bio(bio)->private; - blk_status_t ret; - - /* Save the original iter for read repair */ - if (btrfs_op(bio) == BTRFS_MAP_READ) - btrfs_bio(bio)->iter = bio->bi_iter; - - if (inode->flags & BTRFS_INODE_NODATASUM) - goto map; + if (bio->bi_status) { + btrfs_warn(inode->root->fs_info, + "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d", + btrfs_ino(inode), bio->bi_opf, + dip->file_offset, dip->bytes, bio->bi_status); + } - if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - /* Check btrfs_submit_data_write_bio() for async submit rules */ - if (async_submit && !atomic_read(&inode->sync_writers) && - btrfs_wq_submit_bio(inode, bio, 0, file_offset, - WQ_SUBMIT_DATA_DIO)) - return; + if (btrfs_op(bio) == 
BTRFS_MAP_WRITE) + btrfs_mark_ordered_io_finished(inode, NULL, dip->file_offset, + dip->bytes, !bio->bi_status); + else + unlock_extent(&inode->io_tree, dip->file_offset, + dip->file_offset + dip->bytes - 1, NULL); - /* - * If we aren't doing async submit, calculate the csum of the - * bio now. - */ - ret = btrfs_csum_one_bio(inode, bio, file_offset, false); - if (ret) { - btrfs_bio_end_io(btrfs_bio(bio), ret); - return; - } - } else { - btrfs_bio(bio)->csum = btrfs_csum_ptr(fs_info, dip->csums, - file_offset - dip->file_offset); - } -map: - btrfs_submit_bio(fs_info, bio, 0); + bbio->bio.bi_private = bbio->private; + iomap_dio_bio_end_io(bio); } -static void btrfs_submit_direct(const struct iomap_iter *iter, - struct bio *dio_bio, loff_t file_offset) +static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, + loff_t file_offset) { + struct btrfs_bio *bbio = btrfs_bio(bio); struct btrfs_dio_private *dip = - container_of(dio_bio, struct btrfs_dio_private, bio); - struct inode *inode = iter->inode; - const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - const bool raid56 = (btrfs_data_alloc_profile(fs_info) & - BTRFS_BLOCK_GROUP_RAID56_MASK); - struct bio *bio; - u64 start_sector; - int async_submit = 0; - u64 submit_len; - u64 clone_offset = 0; - u64 clone_len; - u64 logical; - int ret; - blk_status_t status; - struct btrfs_io_geometry geom; + container_of(bbio, struct btrfs_dio_private, bbio); struct btrfs_dio_data *dio_data = iter->private; - struct extent_map *em = NULL; - - dip->inode = BTRFS_I(inode); - dip->file_offset = file_offset; - dip->bytes = dio_bio->bi_iter.bi_size; - refcount_set(&dip->refs, 1); - dip->csums = NULL; - - if (!write && !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - unsigned int nr_sectors = - (dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits); - - /* - * Load the csums up front to reduce csum tree searches and - * contention when submitting bios. - */ - status = BLK_STS_RESOURCE; - dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS); - if (!dip->csums) - goto out_err; - - status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums); - if (status != BLK_STS_OK) - goto out_err; - } - - start_sector = dio_bio->bi_iter.bi_sector; - submit_len = dio_bio->bi_iter.bi_size; - - do { - logical = start_sector << 9; - em = btrfs_get_chunk_map(fs_info, logical, submit_len); - if (IS_ERR(em)) { - status = errno_to_blk_status(PTR_ERR(em)); - em = NULL; - goto out_err_em; - } - ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio), - logical, &geom); - if (ret) { - status = errno_to_blk_status(ret); - goto out_err_em; - } - clone_len = min(submit_len, geom.len); - ASSERT(clone_len <= UINT_MAX); + btrfs_bio_init(bbio, BTRFS_I(iter->inode), btrfs_dio_end_io, bio->bi_private); + bbio->file_offset = file_offset; - /* - * This will never fail as it's passing GPF_NOFS and - * the allocation is backed by btrfs_bioset. - */ - bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len, - btrfs_end_dio_bio, dip); - btrfs_bio(bio)->file_offset = file_offset; - - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - status = extract_ordered_extent(BTRFS_I(inode), bio, - file_offset); - if (status) { - bio_put(bio); - goto out_err; - } - } - - ASSERT(submit_len >= clone_len); - submit_len -= clone_len; - - /* - * Increase the count before we submit the bio so we know - * the end IO handler won't happen before we increase the - * count. 
Otherwise, the dip might get freed before we're - * done setting it up. - * - * We transfer the initial reference to the last bio, so we - * don't need to increment the reference count for the last one. - */ - if (submit_len > 0) { - refcount_inc(&dip->refs); - /* - * If we are submitting more than one bio, submit them - * all asynchronously. The exception is RAID 5 or 6, as - * asynchronous checksums make it difficult to collect - * full stripe writes. - */ - if (!raid56) - async_submit = 1; - } - - btrfs_submit_dio_bio(bio, BTRFS_I(inode), file_offset, async_submit); - - dio_data->submitted += clone_len; - clone_offset += clone_len; - start_sector += clone_len >> 9; - file_offset += clone_len; - - free_extent_map(em); - } while (submit_len > 0); - return; + dip->file_offset = file_offset; + dip->bytes = bio->bi_iter.bi_size; -out_err_em: - free_extent_map(em); -out_err: - dio_bio->bi_status = status; - btrfs_dio_private_put(dip); + dio_data->submitted += bio->bi_iter.bi_size; + btrfs_submit_bio(bio, 0); } static const struct iomap_ops btrfs_dio_iomap_ops = { @@ -8157,7 +7778,7 @@ static const struct iomap_ops btrfs_dio_iomap_ops = { }; static const struct iomap_dio_ops btrfs_dio_ops = { - .submit_io = btrfs_submit_direct, + .submit_io = btrfs_dio_submit_io, .bio_set = &btrfs_dio_bioset, }; @@ -8552,7 +8173,7 @@ again: unlock_extent(io_tree, page_start, page_end, &cached_state); unlock_page(page); up_read(&BTRFS_I(inode)->i_mmap_lock); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } @@ -8802,7 +8423,7 @@ out: return ret; } -struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, +struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap, struct inode *dir) { struct inode *inode; @@ -8813,7 +8434,7 @@ struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, * Subvolumes don't inherit the sgid bit or the parent's gid if * the parent's sgid bit is set. This is probably a bug. 
*/ - inode_init_owner(mnt_userns, inode, NULL, + inode_init_owner(idmap, inode, NULL, S_IFDIR | (~current_umask() & S_IRWXUGO)); inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; @@ -8850,7 +8471,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->last_log_commit = 0; spin_lock_init(&ei->lock); - spin_lock_init(&ei->io_failure_lock); ei->outstanding_extents = 0; if (sb->s_magic != BTRFS_TEST_MAGIC) btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, @@ -8870,7 +8490,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->io_tree.inode = ei; extent_io_tree_init(fs_info, &ei->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); - ei->io_failure_tree = RB_ROOT; atomic_set(&ei->sync_writers, 0); mutex_init(&ei->log_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); @@ -8994,7 +8613,7 @@ int __init btrfs_init_cachep(void) goto fail; if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, - offsetof(struct btrfs_dio_private, bio), + offsetof(struct btrfs_dio_private, bbio.bio), BIOSET_NEED_BVECS)) goto fail; @@ -9004,7 +8623,7 @@ fail: return -ENOMEM; } -static int btrfs_getattr(struct user_namespace *mnt_userns, +static int btrfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { @@ -9034,7 +8653,7 @@ static int btrfs_getattr(struct user_namespace *mnt_userns, STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(mnt_userns, inode, stat); + generic_fillattr(idmap, inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; spin_lock(&BTRFS_I(inode)->lock); @@ -9289,14 +8908,14 @@ out_notrans: return ret; } -static struct inode *new_whiteout_inode(struct user_namespace *mnt_userns, +static struct inode *new_whiteout_inode(struct mnt_idmap *idmap, struct inode *dir) { struct inode *inode; inode = new_inode(dir->i_sb); if (inode) { - inode_init_owner(mnt_userns, inode, dir, + inode_init_owner(idmap, inode, dir, S_IFCHR | WHITEOUT_MODE); inode->i_op = &btrfs_special_inode_operations; init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); @@ -9304,7 +8923,7 @@ static struct inode *new_whiteout_inode(struct user_namespace *mnt_userns, return inode; } -static int btrfs_rename(struct user_namespace *mnt_userns, +static int btrfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) @@ -9376,7 +8995,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, filemap_flush(old_inode->i_mapping); if (flags & RENAME_WHITEOUT) { - whiteout_args.inode = new_whiteout_inode(mnt_userns, old_dir); + whiteout_args.inode = new_whiteout_inode(idmap, old_dir); if (!whiteout_args.inode) { ret = -ENOMEM; goto out_fscrypt_names; @@ -9545,7 +9164,7 @@ out_fscrypt_names: return ret; } -static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir, +static int btrfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -9558,7 +9177,7 @@ static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_di ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); else - ret = btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir, + ret = btrfs_rename(idmap, old_dir, old_dentry, new_dir, new_dentry, flags); btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info); @@ -9758,7 +9377,7 @@ out: return ret; } -static int btrfs_symlink(struct 
user_namespace *mnt_userns, struct inode *dir, +static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); @@ -9786,7 +9405,7 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, inode = new_inode(dir->i_sb); if (!inode) return -ENOMEM; - inode_init_owner(mnt_userns, inode, dir, S_IFLNK | S_IRWXUGO); + inode_init_owner(idmap, inode, dir, S_IFLNK | S_IRWXUGO); inode->i_op = &btrfs_symlink_inode_operations; inode_nohighmem(inode); inode->i_mapping->a_ops = &btrfs_aops; @@ -10075,7 +9694,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, min_size, actual_len, alloc_hint, trans); } -static int btrfs_permission(struct user_namespace *mnt_userns, +static int btrfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -10088,10 +9707,10 @@ static int btrfs_permission(struct user_namespace *mnt_userns, if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) return -EACCES; } - return generic_permission(mnt_userns, inode, mask); + return generic_permission(idmap, inode, mask); } -static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); @@ -10109,7 +9728,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, inode = new_inode(dir->i_sb); if (!inode) return -ENOMEM; - inode_init_owner(mnt_userns, inode, dir, mode); + inode_init_owner(idmap, inode, dir, mode); inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; @@ -10289,65 +9908,13 @@ struct btrfs_encoded_read_private { wait_queue_head_t wait; atomic_t pending; blk_status_t status; - bool skip_csum; }; -static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, - struct bio *bio, int mirror_num) -{ - struct btrfs_encoded_read_private *priv = btrfs_bio(bio)->private; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - blk_status_t ret; - - if (!priv->skip_csum) { - ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); - if (ret) - return ret; - } - - atomic_inc(&priv->pending); - btrfs_submit_bio(fs_info, bio, mirror_num); - return BLK_STS_OK; -} - -static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) -{ - const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK); - struct btrfs_encoded_read_private *priv = bbio->private; - struct btrfs_inode *inode = priv->inode; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - u32 sectorsize = fs_info->sectorsize; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - u32 bio_offset = 0; - - if (priv->skip_csum || !uptodate) - return bbio->bio.bi_status; - - bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { - unsigned int i, nr_sectors, pgoff; - - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); - pgoff = bvec->bv_offset; - for (i = 0; i < nr_sectors; i++) { - ASSERT(pgoff < PAGE_SIZE); - if (btrfs_check_data_csum(inode, bbio, bio_offset, - bvec->bv_page, pgoff)) - return BLK_STS_IOERR; - bio_offset += sectorsize; - pgoff += sectorsize; - } - } - return BLK_STS_OK; -} - static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) { struct btrfs_encoded_read_private *priv = bbio->private; - blk_status_t status; - status = btrfs_encoded_read_verify_csum(bbio); - if 
(status) { + if (bbio->bio.bi_status) { /* * The memory barrier implied by the atomic_dec_return() here * pairs with the memory barrier implied by the @@ -10356,11 +9923,10 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) * write is observed before the load of status in * btrfs_encoded_read_regular_fill_pages(). */ - WRITE_ONCE(priv->status, status); + WRITE_ONCE(priv->status, bbio->bio.bi_status); } if (!atomic_dec_return(&priv->pending)) wake_up(&priv->wait); - btrfs_bio_free_csum(bbio); bio_put(&bbio->bio); } @@ -10368,47 +9934,26 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 disk_io_size, struct page **pages) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_encoded_read_private priv = { .inode = inode, .file_offset = file_offset, .pending = ATOMIC_INIT(1), - .skip_csum = (inode->flags & BTRFS_INODE_NODATASUM), }; unsigned long i = 0; u64 cur = 0; - int ret; init_waitqueue_head(&priv.wait); - /* - * Submit bios for the extent, splitting due to bio or stripe limits as - * necessary. - */ + /* Submit bios for the extent, splitting due to bio limits as necessary. */ while (cur < disk_io_size) { - struct extent_map *em; - struct btrfs_io_geometry geom; struct bio *bio = NULL; - u64 remaining; + u64 remaining = disk_io_size - cur; - em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur, - disk_io_size - cur); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - } else { - ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ, - disk_bytenr + cur, &geom); - free_extent_map(em); - } - if (ret) { - WRITE_ONCE(priv.status, errno_to_blk_status(ret)); - break; - } - remaining = min(geom.len, disk_io_size - cur); while (bio || remaining) { size_t bytes = min_t(u64, remaining, PAGE_SIZE); if (!bio) { bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, + inode, btrfs_encoded_read_endio, &priv); bio->bi_iter.bi_sector = @@ -10417,14 +9962,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, if (!bytes || bio_add_page(bio, pages[i], bytes, 0) < bytes) { - blk_status_t status; - - status = submit_encoded_read_bio(inode, bio, 0); - if (status) { - WRITE_ONCE(priv.status, status); - bio_put(bio); - goto out; - } + atomic_inc(&priv.pending); + btrfs_submit_bio(bio, 0); bio = NULL; continue; } @@ -10435,7 +9974,6 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, } } -out: if (atomic_dec_return(&priv.pending)) io_wait_event(priv.wait, !atomic_read(&priv.pending)); /* See btrfs_encoded_read_endio() for ordering. 
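(The shape of the completion counting used by the encoded-read code above, modeled with C11 atomics and made-up names: pending starts at 1 for the submitter, each submitted bio adds one, each endio drops one, and whoever drops the count to zero owns the wakeup. The kernel sleeps in io_wait_event() rather than spinning as this sketch does.)

#include <stdatomic.h>

static atomic_int pending = 1;          /* the submitter's own reference */

static void submit_one(void)
{
        atomic_fetch_add(&pending, 1);  /* one reference per in-flight bio */
        /* ... hand the bio to the block layer ... */
}

static void endio_one(void)
{
        if (atomic_fetch_sub(&pending, 1) == 1)
                ;                       /* count hit zero: wake the waiter */
}

static void wait_for_all(void)
{
        /* drop the submitter's reference; wait only if bios remain */
        if (atomic_fetch_sub(&pending, 1) != 1)
                while (atomic_load(&pending) != 0)
                        ;               /* io_wait_event() in the kernel */
}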
*/ @@ -10995,9 +10533,8 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis, return 0; max_pages = sis->max - bsi->nr_pages; - first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT; - next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len, - PAGE_SIZE) >> PAGE_SHIFT; + first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT; + next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT; if (first_ppage >= next_ppage) return 0; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7e348bd2ccde..84626c8ad5bf 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -243,7 +243,7 @@ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -int btrfs_fileattr_set(struct user_namespace *mnt_userns, +int btrfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); @@ -578,7 +578,7 @@ static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit return num_items; } -static noinline int create_subvol(struct user_namespace *mnt_userns, +static noinline int create_subvol(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, struct btrfs_qgroup_inherit *inherit) { @@ -623,7 +623,7 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, if (ret < 0) goto out_root_item; - new_inode_args.inode = btrfs_new_subvol_inode(mnt_userns, dir); + new_inode_args.inode = btrfs_new_subvol_inode(idmap, dir); if (!new_inode_args.inode) { ret = -ENOMEM; goto out_anon_dev; @@ -707,7 +707,7 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, * exists). */ btrfs_tree_lock(leaf); - btrfs_clean_tree_block(leaf); + btrfs_clear_buffer_dirty(trans, leaf); btrfs_tree_unlock(leaf); btrfs_free_tree_block(trans, objectid, leaf, 0, 1); free_extent_buffer(leaf); @@ -898,7 +898,7 @@ free_pending: * nfs_async_unlink(). */ -static int btrfs_may_delete(struct user_namespace *mnt_userns, +static int btrfs_may_delete(struct mnt_idmap *idmap, struct inode *dir, struct dentry *victim, int isdir) { int error; @@ -909,12 +909,12 @@ static int btrfs_may_delete(struct user_namespace *mnt_userns, BUG_ON(d_inode(victim->d_parent) != dir); audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); - error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); + error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) return -EPERM; - if (check_sticky(mnt_userns, dir, d_inode(victim)) || + if (check_sticky(idmap, dir, d_inode(victim)) || IS_APPEND(d_inode(victim)) || IS_IMMUTABLE(d_inode(victim)) || IS_SWAPFILE(d_inode(victim))) return -EPERM; @@ -933,16 +933,16 @@ static int btrfs_may_delete(struct user_namespace *mnt_userns, } /* copy of may_create in fs/namei.c() */ -static inline int btrfs_may_create(struct user_namespace *mnt_userns, +static inline int btrfs_may_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *child) { if (d_really_is_positive(child)) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; - if (!fsuidgid_has_mapping(dir->i_sb, mnt_userns)) + if (!fsuidgid_has_mapping(dir->i_sb, idmap)) return -EOVERFLOW; - return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); + return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); } /* @@ -951,7 +951,7 @@ static inline int btrfs_may_create(struct user_namespace *mnt_userns, * inside this filesystem so it's quite a bit simpler. 
*/ static noinline int btrfs_mksubvol(const struct path *parent, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, const char *name, int namelen, struct btrfs_root *snap_src, bool readonly, @@ -967,12 +967,12 @@ static noinline int btrfs_mksubvol(const struct path *parent, if (error == -EINTR) return error; - dentry = lookup_one(mnt_userns, name, parent->dentry, namelen); + dentry = lookup_one(idmap, name, parent->dentry, namelen); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_unlock; - error = btrfs_may_create(mnt_userns, dir, dentry); + error = btrfs_may_create(idmap, dir, dentry); if (error) goto out_dput; @@ -993,7 +993,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, if (snap_src) error = create_snapshot(snap_src, dir, dentry, readonly, inherit); else - error = create_subvol(mnt_userns, dir, dentry, inherit); + error = create_subvol(idmap, dir, dentry, inherit); if (!error) fsnotify_mkdir(dir, dentry); @@ -1007,7 +1007,7 @@ out_unlock: } static noinline int btrfs_mksnapshot(const struct path *parent, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, const char *name, int namelen, struct btrfs_root *root, bool readonly, @@ -1037,7 +1037,7 @@ static noinline int btrfs_mksnapshot(const struct path *parent, btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); - ret = btrfs_mksubvol(parent, mnt_userns, name, namelen, + ret = btrfs_mksubvol(parent, idmap, name, namelen, root, readonly, inherit); out: if (snapshot_force_cow) @@ -1240,7 +1240,7 @@ out_drop: } static noinline int __btrfs_ioctl_snap_create(struct file *file, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, const char *name, unsigned long fd, int subvol, bool readonly, struct btrfs_qgroup_inherit *inherit) @@ -1268,7 +1268,7 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, } if (subvol) { - ret = btrfs_mksubvol(&file->f_path, mnt_userns, name, + ret = btrfs_mksubvol(&file->f_path, idmap, name, namelen, NULL, readonly, inherit); } else { struct fd src = fdget(fd); @@ -1283,14 +1283,14 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, btrfs_info(BTRFS_I(file_inode(file))->root->fs_info, "Snapshot src from another FS"); ret = -EXDEV; - } else if (!inode_owner_or_capable(mnt_userns, src_inode)) { + } else if (!inode_owner_or_capable(idmap, src_inode)) { /* * Subvolume creation is not restricted, but snapshots * are limited to own subvolumes only */ ret = -EPERM; } else { - ret = btrfs_mksnapshot(&file->f_path, mnt_userns, + ret = btrfs_mksnapshot(&file->f_path, idmap, name, namelen, BTRFS_I(src_inode)->root, readonly, inherit); @@ -1317,7 +1317,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file), + ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file), vol_args->name, vol_args->fd, subvol, false, NULL); @@ -1377,7 +1377,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, } } - ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file), + ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file), vol_args->name, vol_args->fd, subvol, readonly, inherit); if (ret) @@ -1422,7 +1422,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, u64 flags; int ret = 0; - if (!inode_owner_or_capable(file_mnt_user_ns(file), inode)) + if (!inode_owner_or_capable(file_mnt_idmap(file), inode)) return -EPERM; ret = mnt_want_write_file(file); @@ -1870,7 
+1870,7 @@ out: return ret; } -static int btrfs_search_path_in_tree_user(struct user_namespace *mnt_userns, +static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, struct inode *inode, struct btrfs_ioctl_ino_lookup_user_args *args) { @@ -1962,7 +1962,7 @@ static int btrfs_search_path_in_tree_user(struct user_namespace *mnt_userns, ret = PTR_ERR(temp_inode); goto out_put; } - ret = inode_permission(mnt_userns, temp_inode, + ret = inode_permission(idmap, temp_inode, MAY_READ | MAY_EXEC); iput(temp_inode); if (ret) { @@ -2101,7 +2101,7 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp) return -EACCES; } - ret = btrfs_search_path_in_tree_user(file_mnt_user_ns(file), inode, args); + ret = btrfs_search_path_in_tree_user(file_mnt_idmap(file), inode, args); if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) ret = -EFAULT; @@ -2335,7 +2335,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct btrfs_root *dest = NULL; struct btrfs_ioctl_vol_args *vol_args = NULL; struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL; - struct user_namespace *mnt_userns = file_mnt_user_ns(file); + struct mnt_idmap *idmap = file_mnt_idmap(file); char *subvol_name, *subvol_name_ptr = NULL; int subvol_namelen; int err = 0; @@ -2428,7 +2428,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * anywhere in the filesystem the user wouldn't be able * to delete without an idmapped mount. */ - if (old_dir != dir && mnt_userns != &init_user_ns) { + if (old_dir != dir && idmap != &nop_mnt_idmap) { err = -EOPNOTSUPP; goto free_parent; } @@ -2471,7 +2471,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); if (err == -EINTR) goto free_subvol_name; - dentry = lookup_one(mnt_userns, subvol_name, parent, subvol_namelen); + dentry = lookup_one(idmap, subvol_name, parent, subvol_namelen); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out_unlock_dir; @@ -2513,13 +2513,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (root == dest) goto out_dput; - err = inode_permission(mnt_userns, inode, MAY_WRITE | MAY_EXEC); + err = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC); if (err) goto out_dput; } /* check if subvolume may be deleted by a user */ - err = btrfs_may_delete(mnt_userns, dir, dentry, 1); + err = btrfs_may_delete(idmap, dir, dentry, 1); if (err) goto out_dput; @@ -2582,7 +2582,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) * running and allows defrag on files open in read-only mode. 
*/ if (!capable(CAP_SYS_ADMIN) && - inode_permission(&init_user_ns, inode, MAY_WRITE)) { + inode_permission(&nop_mnt_idmap, inode, MAY_WRITE)) { ret = -EPERM; goto out; } @@ -3907,7 +3907,7 @@ static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info, } static long _btrfs_ioctl_set_received_subvol(struct file *file, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct btrfs_ioctl_received_subvol_args *sa) { struct inode *inode = file_inode(file); @@ -3919,7 +3919,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, int ret = 0; int received_uuid_changed; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EPERM; ret = mnt_want_write_file(file); @@ -4024,7 +4024,7 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file, args64->rtime.nsec = args32->rtime.nsec; args64->flags = args32->flags; - ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), args64); + ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_idmap(file), args64); if (ret) goto out; @@ -4058,7 +4058,7 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, if (IS_ERR(sa)) return PTR_ERR(sa); - ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), sa); + ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_idmap(file), sa); if (ret) goto out; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 8a855d5ac2fa..d51b9a2f2f6e 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -6,7 +6,7 @@ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int btrfs_fileattr_set(struct user_namespace *mnt_userns, +int btrfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); diff --git a/fs/btrfs/lru_cache.c b/fs/btrfs/lru_cache.c new file mode 100644 index 000000000000..0fe0ae54ac67 --- /dev/null +++ b/fs/btrfs/lru_cache.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/mm.h> +#include "lru_cache.h" +#include "messages.h" + +/* + * Initialize a cache object. + * + * @cache: The cache. + * @max_size: Maximum size (number of entries) for the cache. + * Use 0 for unlimited size; it's the user's responsibility to + * trim the cache in that case. + */ +void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size) +{ + INIT_LIST_HEAD(&cache->lru_list); + mt_init(&cache->entries); + cache->size = 0; + cache->max_size = max_size; +} + +static struct btrfs_lru_cache_entry *match_entry(struct list_head *head, u64 key, + u64 gen) +{ + struct btrfs_lru_cache_entry *entry; + + list_for_each_entry(entry, head, list) { + if (entry->key == key && entry->gen == gen) + return entry; + } + + return NULL; +} + +/* + * Look up an entry in the cache. + * + * @cache: The cache. + * @key: The key of the entry we are looking for. + * @gen: Generation associated with the key. + * + * Returns the entry associated with the key or NULL if none found.
+ */ +struct btrfs_lru_cache_entry *btrfs_lru_cache_lookup(struct btrfs_lru_cache *cache, + u64 key, u64 gen) +{ + struct list_head *head; + struct btrfs_lru_cache_entry *entry; + + head = mtree_load(&cache->entries, key); + if (!head) + return NULL; + + entry = match_entry(head, key, gen); + if (entry) + list_move_tail(&entry->lru_list, &cache->lru_list); + + return entry; +} + +/* + * Remove an entry from the cache. + * + * @cache: The cache to remove from. + * @entry: The entry to remove from the cache. + * + * Note: this also frees the memory used by the entry. + */ +void btrfs_lru_cache_remove(struct btrfs_lru_cache *cache, + struct btrfs_lru_cache_entry *entry) +{ + struct list_head *prev = entry->list.prev; + + ASSERT(cache->size > 0); + ASSERT(!mtree_empty(&cache->entries)); + + list_del(&entry->list); + list_del(&entry->lru_list); + + if (list_empty(prev)) { + struct list_head *head; + + /* + * If previous element in the list entry->list is now empty, it + * means it's a head entry not pointing to any cached entries, + * so remove it from the maple tree and free it. + */ + head = mtree_erase(&cache->entries, entry->key); + ASSERT(head == prev); + kfree(head); + } + + kfree(entry); + cache->size--; +} + +/* + * Store an entry in the cache. + * + * @cache: The cache. + * @entry: The entry to store. + * + * Returns 0 on success and < 0 on error. + */ +int btrfs_lru_cache_store(struct btrfs_lru_cache *cache, + struct btrfs_lru_cache_entry *new_entry, + gfp_t gfp) +{ + const u64 key = new_entry->key; + struct list_head *head; + int ret; + + head = kmalloc(sizeof(*head), gfp); + if (!head) + return -ENOMEM; + + ret = mtree_insert(&cache->entries, key, head, gfp); + if (ret == 0) { + INIT_LIST_HEAD(head); + list_add_tail(&new_entry->list, head); + } else if (ret == -EEXIST) { + kfree(head); + head = mtree_load(&cache->entries, key); + ASSERT(head != NULL); + if (match_entry(head, key, new_entry->gen) != NULL) + return -EEXIST; + list_add_tail(&new_entry->list, head); + } else if (ret < 0) { + kfree(head); + return ret; + } + + if (cache->max_size > 0 && cache->size == cache->max_size) { + struct btrfs_lru_cache_entry *lru_entry; + + lru_entry = list_first_entry(&cache->lru_list, + struct btrfs_lru_cache_entry, + lru_list); + btrfs_lru_cache_remove(cache, lru_entry); + } + + list_add_tail(&new_entry->lru_list, &cache->lru_list); + cache->size++; + + return 0; +} + +/* + * Empty a cache. + * + * @cache: The cache to empty. + * + * Removes all entries from the cache. + */ +void btrfs_lru_cache_clear(struct btrfs_lru_cache *cache) +{ + struct btrfs_lru_cache_entry *entry; + struct btrfs_lru_cache_entry *tmp; + + list_for_each_entry_safe(entry, tmp, &cache->lru_list, lru_list) + btrfs_lru_cache_remove(cache, entry); + + ASSERT(cache->size == 0); + ASSERT(mtree_empty(&cache->entries)); +} diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h new file mode 100644 index 000000000000..de3e18bce24a --- /dev/null +++ b/fs/btrfs/lru_cache.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_LRU_CACHE_H +#define BTRFS_LRU_CACHE_H + +#include <linux/maple_tree.h> +#include <linux/list.h> + +/* + * A cache entry. This is meant to be embedded in a structure of a user of + * this module. Similar to how struct list_head and struct rb_node are used. + * + * Note: it should be embedded as the first element in a struct (offset 0), and + * this module assumes it was allocated with kmalloc(), so it calls kfree() when + * it needs to free an entry. 
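The note above (echoed by static_assert()s in the lru_cache users later in this diff) is the whole contract of the new module: the entry must sit at offset 0 of the embedding object so that the cache's kfree() of the entry pointer frees the whole object, and so container_of() degenerates to a cast. A user-space sketch of that embedding; the types and names here are stand-ins, not the kernel structures:

#include <stddef.h>

/* Stand-ins for the kernel types; only the layout rule matters here. */
struct lru_entry_sketch {
        unsigned long long key;
        unsigned long long gen;
};

struct my_cached_item {
        struct lru_entry_sketch entry;  /* must be the FIRST member */
        int payload;
};

/* With the member at offset 0, container_of() is a plain cast, and freeing
 * the entry pointer frees the embedding object as well. */
#define entry_to_item(e) ((struct my_cached_item *)(e))

_Static_assert(offsetof(struct my_cached_item, entry) == 0,
               "entry must be the first member");

int main(void)
{
        struct my_cached_item item = { .entry = { .key = 257, .gen = 1 }, .payload = 42 };

        return entry_to_item(&item.entry)->payload == 42 ? 0 : 1;
}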
+ */ +struct btrfs_lru_cache_entry { + struct list_head lru_list; + u64 key; + /* + * Optional generation associated with a key. Use 0 if not needed/used. + * Entries with the same key and different generations are stored in a + * linked list, so use this only for cases where there's a small number + * of different generations. + */ + u64 gen; + /* + * The maple tree uses unsigned long type for the keys, which is 32 bits + * on 32-bit systems and 64 bits on 64-bit systems. So if we want to + * use something like inode numbers as keys, which are always a u64, we + * have to deal with this in a special way - we store the key in the + * entry itself, as a u64, and the values inserted into the maple tree + * are linked lists of entries - so on a 64-bit system, + * that list always has a single entry, while on 32-bit systems it + * may have more than one, with each entry having the same value for + * their lower 32 bits of the u64 key. + */ + struct list_head list; +}; + +struct btrfs_lru_cache { + struct list_head lru_list; + struct maple_tree entries; + /* Number of entries stored in the cache. */ + unsigned int size; + /* Maximum number of entries the cache can have. */ + unsigned int max_size; +}; + +#define btrfs_lru_cache_for_each_entry_safe(cache, entry, tmp) \ + list_for_each_entry_safe_reverse((entry), (tmp), &(cache)->lru_list, lru_list) + +static inline unsigned int btrfs_lru_cache_size(const struct btrfs_lru_cache *cache) +{ + return cache->size; +} + +static inline bool btrfs_lru_cache_is_full(const struct btrfs_lru_cache *cache) +{ + return cache->size >= cache->max_size; +} + +static inline struct btrfs_lru_cache_entry *btrfs_lru_cache_lru_entry( + struct btrfs_lru_cache *cache) +{ + return list_first_entry_or_null(&cache->lru_list, + struct btrfs_lru_cache_entry, lru_list); +} + +void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size); +struct btrfs_lru_cache_entry *btrfs_lru_cache_lookup(struct btrfs_lru_cache *cache, + u64 key, u64 gen); +int btrfs_lru_cache_store(struct btrfs_lru_cache *cache, + struct btrfs_lru_cache_entry *new_entry, + gfp_t gfp); +void btrfs_lru_cache_remove(struct btrfs_lru_cache *cache, + struct btrfs_lru_cache_entry *entry); +void btrfs_lru_cache_clear(struct btrfs_lru_cache *cache); + +#endif diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index d5e78cbc8fbc..71f6d8302d50 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -280,7 +280,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, } /* Check if we have reached page boundary */ - if (IS_ALIGNED(cur_in, PAGE_SIZE)) { + if (PAGE_ALIGNED(cur_in)) { put_page(page_in); page_in = NULL; } diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index 625bbbbb2608..fde5aaa6e7c9 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -293,36 +293,6 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) #endif /* - * We only mark the transaction aborted and then set the file system read-only. - * This will prevent new transactions from starting or trying to join this - * one. - * - * This means that error recovery at the call site is limited to freeing - * any local memory allocations and passing the error code up without - * further cleanup. The transaction should complete as it normally would - * in the call path but will return -EIO. - * - * We'll complete the cleanup in btrfs_end_transaction and - * btrfs_commit_transaction.
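The key-width comment in btrfs_lru_cache_entry above is easy to skim past: on a 32-bit kernel the maple tree index silently truncates a u64 key, so two different inode numbers can land in the same tree slot, which is why each slot stores a list and match_entry() compares the full key and generation. A user-space illustration of that truncation, with made-up inode numbers:

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint64_t ino_a = 0x00000001000000ffULL; /* made-up inode numbers */
        uint64_t ino_b = 0x00000002000000ffULL;

        /* Pretend unsigned long is 32 bits wide, as on a 32-bit kernel. */
        uint32_t slot_a = (uint32_t)ino_a;
        uint32_t slot_b = (uint32_t)ino_b;

        /* Distinct keys, same maple tree slot: hence the per-slot list and
         * the full u64 key + gen comparison in match_entry(). */
        assert(ino_a != ino_b);
        assert(slot_a == slot_b);
        return 0;
}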
- */ -__cold -void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - const char *function, - unsigned int line, int errno, bool first_hit) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - - WRITE_ONCE(trans->aborted, errno); - WRITE_ONCE(trans->transaction->aborted, errno); - if (first_hit && errno == -ENOSPC) - btrfs_dump_space_info_for_trans_abort(fs_info); - /* Wake up anybody who may be waiting on this transaction */ - wake_up(&fs_info->transaction_wait); - wake_up(&fs_info->transaction_blocked_wait); - __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); -} - -/* * __btrfs_panic decodes unexpected, fatal errors from the caller, issues an * alert, and either panics or BUGs, depending on mount options. */ diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 190af1f698d9..8c516ee58ff9 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -6,7 +6,6 @@ #include <linux/types.h> struct btrfs_fs_info; -struct btrfs_trans_handle; static inline __printf(2, 3) __cold void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) @@ -178,39 +177,6 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function const char * __attribute_const__ btrfs_decode_error(int errno); -__cold -void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - const char *function, - unsigned int line, int errno, bool first_hit); - -bool __cold abort_should_print_stack(int errno); - -/* - * Call btrfs_abort_transaction as early as possible when an error condition is - * detected, that way the exact stack trace is reported for some errors. - */ -#define btrfs_abort_transaction(trans, errno) \ -do { \ - bool first = false; \ - /* Report first abort since mount */ \ - if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ - &((trans)->fs_info->fs_state))) { \ - first = true; \ - if (WARN(abort_should_print_stack(errno), \ - KERN_ERR \ - "BTRFS: Transaction aborted (error %d)\n", \ - (errno))) { \ - /* Stack trace printed. */ \ - } else { \ - btrfs_err((trans)->fs_info, \ - "Transaction aborted (error %d)", \ - (errno)); \ - } \ - } \ - __btrfs_abort_transaction((trans), __func__, \ - __LINE__, (errno), first); \ -} while (0) - #define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ (errno), fmt, ##args) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 57d8c72737e1..6c24b69e2d0a 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -616,7 +616,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) struct btrfs_ordered_extent *ordered; ordered = container_of(work, struct btrfs_ordered_extent, flush_work); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); complete(&ordered->completion); } @@ -716,13 +716,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, } /* - * Used to start IO or wait for a given ordered extent to finish. + * Start IO and wait for a given ordered extent to finish. * - * If wait is one, this effectively waits on page writeback for all the pages - * in the extent, and it waits on the io completion code to insert - * metadata into the btree corresponding to the extent + * Wait on page writeback for all the pages in the extent and the IO completion + * code to insert metadata into the btree corresponding to the extent. 
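The btrfs_abort_transaction() machinery removed from messages.c/messages.h above (presumably relocated elsewhere by this series; the hunks here only show the removal) prints the detailed report just once, for the first abort since mount, using test_and_set_bit() as a one-shot latch. The same latch pattern in portable C, with hypothetical names:

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag aborted = ATOMIC_FLAG_INIT;

static void abort_event(int err)
{
        /* Only the first caller wins the latch and emits the full report. */
        if (!atomic_flag_test_and_set(&aborted))
                fprintf(stderr, "first abort since mount: error %d\n", err);
}

int main(void)
{
        abort_event(-5);        /* -EIO: reported */
        abort_event(-28);       /* -ENOSPC: latch already set, stays quiet */
        return 0;
}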
*/ -void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) +void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) { u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; @@ -744,12 +743,10 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) */ if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); - if (wait) { - if (!freespace_inode) - btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); - wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, - &entry->flags)); - } + + if (!freespace_inode) + btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); + wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); } /* @@ -800,7 +797,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) btrfs_put_ordered_extent(ordered); break; } - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); end = ordered->file_offset; /* * If the ordered extent had an error save the error but don't @@ -1061,7 +1058,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, break; } unlock_extent(&inode->io_tree, start, end, cachedp); - btrfs_start_ordered_extent(ordered, 1); + btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); } } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 89f82b78f590..eb40cb39f842 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -157,7 +157,6 @@ struct btrfs_ordered_extent { * command in a workqueue context */ u64 physical; - struct block_device *bdev; }; static inline void @@ -187,7 +186,7 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset); -void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait); +void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry); int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index af97413abcf4..52a7d2fa2284 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1304,7 +1304,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) list_del("a_root->dirty_list); btrfs_tree_lock(quota_root->node); - btrfs_clean_tree_block(quota_root->node); + btrfs_clear_buffer_dirty(trans, quota_root->node); btrfs_tree_unlock(quota_root->node); btrfs_free_tree_block(trans, btrfs_root_id(quota_root), quota_root->node, 0, 1); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 6a2cf754912d..642828c1b299 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -998,7 +998,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) } /* - * Return the total numer of errors found in the vertical stripe of @sector_nr. + * Return the total number of errors found in the vertical stripe of @sector_nr. * * @faila and @failb will also be updated to the first and second stripe * number of the errors. @@ -1183,7 +1183,15 @@ not_found: trace_info->stripe_nr = -1; } -/* Generate PQ for one veritical stripe. */ +static inline void bio_list_put(struct bio_list *bio_list) +{ + struct bio *bio; + + while ((bio = bio_list_pop(bio_list))) + bio_put(bio); +} + +/* Generate PQ for one vertical stripe. 
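For the vertical-stripe parity that generate_pq_vertical() (next hunk) produces, the RAID5 case reduces to XOR-ing the data sectors of one column; for RAID6 the real code additionally derives Q via the kernel's raid6 library. A toy version over 4-byte "sectors" showing why one lost data sector is recoverable; the values are arbitrary:

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint32_t data[3] = { 0xdeadbeef, 0x00c0ffee, 0x12345678 };      /* one data column */
        uint32_t p = 0;

        for (int i = 0; i < 3; i++)
                p ^= data[i];   /* P is the XOR of the data sectors */

        /* Any single lost data sector is the XOR of P and the survivors. */
        assert((p ^ data[1] ^ data[2]) == data[0]);
        return 0;
}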
*/ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) { void **pointers = rbio->finish_pointers; @@ -1228,7 +1236,6 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { - struct bio *bio; /* The total sector number inside the full stripe. */ int total_sector_nr; int sectornr; @@ -1317,8 +1324,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, return 0; error: - while ((bio = bio_list_pop(bio_list))) - bio_put(bio); + bio_list_put(bio_list); return -EIO; } @@ -1357,7 +1363,7 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) } /* - * For subpage case, we can no longer set page Uptodate directly for + * For subpage case, we can no longer set page Up-to-date directly for * stripe_pages[], thus we need to locate the sector. */ static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, @@ -1425,13 +1431,20 @@ static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bi int total_sector_nr = get_bio_sector_nr(rbio, bio); u32 bio_size = 0; struct bio_vec *bvec; - struct bvec_iter_all iter_all; + int i; - bio_for_each_segment_all(bvec, bio, iter_all) + bio_for_each_bvec_all(bvec, bio, i) bio_size += bvec->bv_len; - bitmap_set(rbio->error_bitmap, total_sector_nr, - bio_size >> rbio->bioc->fs_info->sectorsize_bits); + /* + * Since we can have multiple bios touching the error_bitmap, we cannot + * call bitmap_set() without protection. + * + * Instead use set_bit() for each bit, as set_bit() itself is atomic. + */ + for (i = total_sector_nr; i < total_sector_nr + + (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++) + set_bit(i, rbio->error_bitmap); } /* Verify the data sectors at read time. */ @@ -1490,7 +1503,7 @@ static void raid_wait_read_end_io(struct bio *bio) wake_up(&rbio->io_wait); } -static void submit_read_bios(struct btrfs_raid_bio *rbio, +static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { struct bio *bio; @@ -1507,41 +1520,8 @@ static void submit_read_bios(struct btrfs_raid_bio *rbio, } submit_bio(bio); } -} - -static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list) -{ - struct bio *bio; - int total_sector_nr; - int ret = 0; - ASSERT(bio_list_size(bio_list) == 0); - - /* - * Build a list of bios to read all sectors (including data and P/Q). - * - * This behaviro is to compensate the later csum verification and - * recovery. 
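The error_bitmap hunk above is a correctness fix worth spelling out: bitmap_set() does a plain read-modify-write on whole words, so two bios completing concurrently could lose each other's bits, while one atomic set_bit() per bit cannot. A user-space analogue of the new loop using C11 atomics; the names are illustrative:

#include <stdatomic.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

static _Atomic unsigned long error_bitmap[1];

/* Atomic single-bit set, the user-space analogue of the kernel's set_bit(). */
static void set_bit_atomic(unsigned int nr, _Atomic unsigned long *addr)
{
        atomic_fetch_or(&addr[nr / BITS_PER_LONG], 1UL << (nr % BITS_PER_LONG));
}

/* Mirrors the loop added above: one atomic RMW per failed sector, so
 * concurrent bios touching neighbouring bits cannot lose each other's set. */
static void mark_bio_sectors_bad(unsigned int first, unsigned int nsectors)
{
        for (unsigned int i = first; i < first + nsectors; i++)
                set_bit_atomic(i, error_bitmap);
}

int main(void)
{
        mark_bio_sectors_bad(3, 4);     /* sectors 3..6 of a vertical stripe */
        return 0;
}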
- */ - for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; - total_sector_nr++) { - struct sector_ptr *sector; - int stripe = total_sector_nr / rbio->stripe_nsectors; - int sectornr = total_sector_nr % rbio->stripe_nsectors; - - sector = rbio_stripe_sector(rbio, stripe, sectornr); - ret = rbio_add_io_sector(rbio, bio_list, sector, - stripe, sectornr, REQ_OP_READ); - if (ret) - goto cleanup; - } - return 0; - -cleanup: - while ((bio = bio_list_pop(bio_list))) - bio_put(bio); - return ret; + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); } static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) @@ -1660,12 +1640,12 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) struct btrfs_raid_bio *rbio; struct btrfs_plug_cb *plug = NULL; struct blk_plug_cb *cb; - int ret = 0; rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { - ret = PTR_ERR(rbio); - goto fail; + bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); + bio_endio(bio); + return; } rbio->operation = BTRFS_RBIO_WRITE; rbio_add_bio(rbio, bio); @@ -1674,31 +1654,24 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) * Don't plug on full rbios, just get them out the door * as quickly as we can */ - if (rbio_is_full(rbio)) - goto queue_rbio; - - cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); - if (cb) { - plug = container_of(cb, struct btrfs_plug_cb, cb); - if (!plug->info) { - plug->info = fs_info; - INIT_LIST_HEAD(&plug->rbio_list); + if (!rbio_is_full(rbio)) { + cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); + if (cb) { + plug = container_of(cb, struct btrfs_plug_cb, cb); + if (!plug->info) { + plug->info = fs_info; + INIT_LIST_HEAD(&plug->rbio_list); + } + list_add_tail(&rbio->plug_list, &plug->rbio_list); + return; } - list_add_tail(&rbio->plug_list, &plug->rbio_list); - return; } -queue_rbio: + /* * Either we don't have any existing plug, or we're doing a full stripe, - * can queue the rmw work now. + * queue the rmw work now. */ start_async_work(rbio, rmw_rbio_work); - - return; - -fail: - bio->bi_status = errno_to_blk_status(ret); - bio_endio(bio); } static int verify_one_sector(struct btrfs_raid_bio *rbio, @@ -1765,7 +1738,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, &failb); /* - * No errors in the veritical stripe, skip it. Can happen for recovery + * No errors in the vertical stripe, skip it. Can happen for recovery * which only part of a stripe failed csum check. */ if (!found_errors) @@ -1886,7 +1859,7 @@ pstripe: sector->uptodate = 1; } if (failb >= 0) { - ret = verify_one_sector(rbio, faila, sector_nr); + ret = verify_one_sector(rbio, failb, sector_nr); if (ret < 0) goto cleanup; @@ -1941,14 +1914,25 @@ out: return ret; } -static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list) +static void recover_rbio(struct btrfs_raid_bio *rbio) { - struct bio *bio; + struct bio_list bio_list = BIO_EMPTY_LIST; int total_sector_nr; int ret = 0; - ASSERT(bio_list_size(bio_list) == 0); + /* + * Either we're doing recover for a read failure or degraded write, + * caller should have set error bitmap correctly. + */ + ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); + + /* For recovery, we need to read all sectors including P/Q. */ + ret = alloc_rbio_pages(rbio); + if (ret < 0) + goto out; + + index_rbio_pages(rbio); + /* * Read everything that hasn't failed. 
However this time we will * not trust any cached sector. @@ -1979,78 +1963,32 @@ static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, } sector = rbio_stripe_sector(rbio, stripe, sectornr); - ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, sectornr, REQ_OP_READ); - if (ret < 0) - goto error; + if (ret < 0) { + bio_list_put(&bio_list); + goto out; + } } - return 0; -error: - while ((bio = bio_list_pop(bio_list))) - bio_put(bio); - - return -EIO; -} - -static int recover_rbio(struct btrfs_raid_bio *rbio) -{ - struct bio_list bio_list; - struct bio *bio; - int ret; - - /* - * Either we're doing recover for a read failure or degraded write, - * caller should have set error bitmap correctly. - */ - ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); - bio_list_init(&bio_list); - - /* For recovery, we need to read all sectors including P/Q. */ - ret = alloc_rbio_pages(rbio); - if (ret < 0) - goto out; - - index_rbio_pages(rbio); - - ret = recover_assemble_read_bios(rbio, &bio_list); - if (ret < 0) - goto out; - - submit_read_bios(rbio, &bio_list); - wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + submit_read_wait_bio_list(rbio, &bio_list); ret = recover_sectors(rbio); - out: - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); - - return ret; + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } static void recover_rbio_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; - int ret; rbio = container_of(work, struct btrfs_raid_bio, work); - - ret = lock_stripe_add(rbio); - if (ret == 0) { - ret = recover_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); - } + if (!lock_stripe_add(rbio)) + recover_rbio(rbio); } static void recover_rbio_work_locked(struct work_struct *work) { - struct btrfs_raid_bio *rbio; - int ret; - - rbio = container_of(work, struct btrfs_raid_bio, work); - - ret = recover_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + recover_rbio(container_of(work, struct btrfs_raid_bio, work)); } static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num) @@ -2196,11 +2134,9 @@ no_csum: static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) { - struct bio_list bio_list; - struct bio *bio; - int ret; - - bio_list_init(&bio_list); + struct bio_list bio_list = BIO_EMPTY_LIST; + int total_sector_nr; + int ret = 0; /* * Fill the data csums we need for data verification. We need to fill @@ -2209,24 +2145,32 @@ static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) */ fill_data_csums(rbio); - ret = rmw_assemble_read_bios(rbio, &bio_list); - if (ret < 0) - goto out; + /* + * Build a list of bios to read all sectors (including data and P/Q). + * + * This behavior is to compensate for the later csum verification and recovery. + */ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct sector_ptr *sector; + int stripe = total_sector_nr / rbio->stripe_nsectors; + int sectornr = total_sector_nr % rbio->stripe_nsectors; - submit_read_bios(rbio, &bio_list); - wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + sector = rbio_stripe_sector(rbio, stripe, sectornr); + ret = rbio_add_io_sector(rbio, &bio_list, sector, + stripe, sectornr, REQ_OP_READ); + if (ret) { + bio_list_put(&bio_list); + return ret; + } + } /* * We may or may not have any corrupted sectors (including missing dev * and csum mismatch), just let recover_sectors() handle them all.
*/ - ret = recover_sectors(rbio); - return ret; -out: - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); - - return ret; + submit_read_wait_bio_list(rbio, &bio_list); + return recover_sectors(rbio); } static void raid_wait_write_end_io(struct bio *bio) @@ -2282,7 +2226,7 @@ static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) return false; } -static int rmw_rbio(struct btrfs_raid_bio *rbio) +static void rmw_rbio(struct btrfs_raid_bio *rbio) { struct bio_list bio_list; int sectornr; @@ -2294,30 +2238,28 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio) */ ret = alloc_rbio_parity_pages(rbio); if (ret < 0) - return ret; + goto out; /* * Either full stripe write, or we have every data sector already * cached, can go to write path immediately. */ - if (rbio_is_full(rbio) || !need_read_stripe_sectors(rbio)) - goto write; - - /* - * Now we're doing sub-stripe write, also need all data stripes to do - * the full RMW. - */ - ret = alloc_rbio_data_pages(rbio); - if (ret < 0) - return ret; + if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) { + /* + * Now we're doing sub-stripe write, also need all data stripes + * to do the full RMW. + */ + ret = alloc_rbio_data_pages(rbio); + if (ret < 0) + goto out; - index_rbio_pages(rbio); + index_rbio_pages(rbio); - ret = rmw_read_wait_recover(rbio); - if (ret < 0) - return ret; + ret = rmw_read_wait_recover(rbio); + if (ret < 0) + goto out; + } -write: /* * At this stage we're not allowed to add any new bios to the * bio list any more, anyone else that wants to change this stripe @@ -2348,7 +2290,7 @@ write: bio_list_init(&bio_list); ret = rmw_assemble_write_bios(rbio, &bio_list); if (ret < 0) - return ret; + goto out; /* We should have at least one bio assembled. */ ASSERT(bio_list_size(&bio_list)); @@ -2365,32 +2307,22 @@ write: break; } } - return ret; +out: + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } static void rmw_rbio_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; - int ret; rbio = container_of(work, struct btrfs_raid_bio, work); - - ret = lock_stripe_add(rbio); - if (ret == 0) { - ret = rmw_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); - } + if (lock_stripe_add(rbio) == 0) + rmw_rbio(rbio); } static void rmw_rbio_work_locked(struct work_struct *work) { - struct btrfs_raid_bio *rbio; - int ret; - - rbio = container_of(work, struct btrfs_raid_bio, work); - - ret = rmw_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + rmw_rbio(container_of(work, struct btrfs_raid_bio, work)); } /* @@ -2498,7 +2430,6 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) struct sector_ptr p_sector = { 0 }; struct sector_ptr q_sector = { 0 }; struct bio_list bio_list; - struct bio *bio; int is_replace = 0; int ret; @@ -2629,8 +2560,7 @@ submit_write: return 0; cleanup: - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); + bio_list_put(&bio_list); return ret; } @@ -2725,15 +2655,12 @@ out: return ret; } -static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list) +static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) { - struct bio *bio; + struct bio_list bio_list = BIO_EMPTY_LIST; int total_sector_nr; int ret = 0; - ASSERT(bio_list_size(bio_list) == 0); - /* Build a list of bios to read all the missing parts. 
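All of the read-assembly loops touched above iterate a flat sector index and split it into a (stripe, sector) pair with division and modulo. A trivial self-check of that mapping, assuming a made-up 3-stripe rbio with 4 sectors per stripe:

#include <assert.h>

int main(void)
{
        const int stripe_nsectors = 4;                  /* sectors per stripe (assumed) */
        const int nr_sectors = 3 * stripe_nsectors;     /* 3 stripes in the rbio */

        for (int total = 0; total < nr_sectors; total++) {
                int stripe = total / stripe_nsectors;
                int sectornr = total % stripe_nsectors;

                /* The decomposition is exact and unique for every flat index. */
                assert(stripe * stripe_nsectors + sectornr == total);
                assert(stripe >= 0 && stripe < 3);
        }
        return 0;
}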
*/ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { @@ -2762,45 +2689,38 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio, if (sector->uptodate) continue; - ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, sectornr, REQ_OP_READ); - if (ret) - goto error; + if (ret) { + bio_list_put(&bio_list); + return ret; + } } + + submit_read_wait_bio_list(rbio, &bio_list); return 0; -error: - while ((bio = bio_list_pop(bio_list))) - bio_put(bio); - return ret; } -static int scrub_rbio(struct btrfs_raid_bio *rbio) +static void scrub_rbio(struct btrfs_raid_bio *rbio) { bool need_check = false; - struct bio_list bio_list; int sector_nr; int ret; - struct bio *bio; - - bio_list_init(&bio_list); ret = alloc_rbio_essential_pages(rbio); if (ret) - goto cleanup; + goto out; bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); - ret = scrub_assemble_read_bios(rbio, &bio_list); + ret = scrub_assemble_read_bios(rbio); if (ret < 0) - goto cleanup; - - submit_read_bios(rbio, &bio_list); - wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + goto out; /* We may have some failures, recover the failed sectors first. */ ret = recover_scrub_rbio(rbio); if (ret < 0) - goto cleanup; + goto out; /* * We have every sector properly prepared. Can finish the scrub @@ -2817,23 +2737,13 @@ static int scrub_rbio(struct btrfs_raid_bio *rbio) break; } } - return ret; - -cleanup: - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); - - return ret; +out: + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } static void scrub_rbio_work_locked(struct work_struct *work) { - struct btrfs_raid_bio *rbio; - int ret; - - rbio = container_of(work, struct btrfs_raid_bio, work); - ret = scrub_rbio(rbio); - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + scrub_rbio(container_of(work, struct btrfs_raid_bio, work)); } void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 7c73a443939e..df0e0abdeb1f 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -65,7 +65,7 @@ struct btrfs_raid_bio { /* Number of data stripes (no p/q) */ u8 nr_data; - /* Numer of all stripes (including P/Q) */ + /* Number of all stripes (including P/Q) */ u8 real_stripes; /* How many pages there are for each stripe */ @@ -132,7 +132,7 @@ struct btrfs_raid_bio { /* * Checksum buffer if the rbio is for data. The buffer should cover - * all data sectors (exlcuding P/Q sectors). + * all data sectors (excluding P/Q sectors). */ u8 *csum_buf; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 31ec4a7658ce..ef13a9d4e370 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2825,7 +2825,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( * * Here we have to manually invalidate the range (i_size, PAGE_END + 1). 
*/ - if (!IS_ALIGNED(i_size, PAGE_SIZE)) { + if (!PAGE_ALIGNED(i_size)) { struct address_space *mapping = inode->vfs_inode.i_mapping; struct btrfs_fs_info *fs_info = inode->root->fs_info; const u32 sectorsize = fs_info->sectorsize; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 52b346795f66..69c93ae333f6 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -229,7 +229,7 @@ struct full_stripe_lock { }; #ifndef CONFIG_64BIT -/* This structure is for archtectures whose (void *) is smaller than u64 */ +/* This structure is for architectures whose (void *) is smaller than u64 */ struct scrub_page_private { u64 logical; }; @@ -2053,20 +2053,33 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) * a) don't have an extent buffer and * b) the page is already kmapped */ - if (sblock->logical != btrfs_stack_header_bytenr(h)) + if (sblock->logical != btrfs_stack_header_bytenr(h)) { sblock->header_error = 1; - - if (sector->generation != btrfs_stack_header_generation(h)) { - sblock->header_error = 1; - sblock->generation_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad bytenr, has %llu want %llu", + sblock->logical, sblock->mirror_num, + btrfs_stack_header_bytenr(h), + sblock->logical); + goto out; } - if (!scrub_check_fsid(h->fsid, sector)) + if (!scrub_check_fsid(h->fsid, sector)) { sblock->header_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad fsid, has %pU want %pU", + sblock->logical, sblock->mirror_num, + h->fsid, sblock->dev->fs_devices->fsid); + goto out; + } - if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, - BTRFS_UUID_SIZE)) + if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) { sblock->header_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", + sblock->logical, sblock->mirror_num, + h->chunk_tree_uuid, fs_info->chunk_tree_uuid); + goto out; + } shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); @@ -2079,9 +2092,27 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) } crypto_shash_final(shash, calculated_csum); - if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) + if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) { sblock->checksum_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, + sblock->logical, sblock->mirror_num, + CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), + CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); + goto out; + } + + if (sector->generation != btrfs_stack_header_generation(h)) { + sblock->header_error = 1; + sblock->generation_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad generation, has %llu want %llu", + sblock->logical, sblock->mirror_num, + btrfs_stack_header_generation(h), + sector->generation); + } +out: return sblock->header_error || sblock->checksum_error; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e65e6b6600a7..e5c963bb873d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -32,6 +32,7 @@ #include "file-item.h" #include "ioctl.h" #include "verity.h" +#include "lru_cache.h" /* * Maximum number of references an extent can have in order for us to attempt to @@ -80,23 +81,23 @@ struct clone_root { bool found_ref; }; -#define SEND_CTX_MAX_NAME_CACHE_SIZE 128 -#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2) +#define SEND_MAX_NAME_CACHE_SIZE 256 /* - * Limit the root_ids array of struct backref_cache_entry to 
12 elements. - * This makes the size of a cache entry to be exactly 128 bytes on x86_64. + * Limit the root_ids array of struct backref_cache_entry to 17 elements. + * This makes the size of a cache entry exactly 192 bytes on x86_64, which + * can be satisfied from the kmalloc-192 slab, without wasting any space. * The most common case is to have a single root for cloning, which corresponds - * to the send root. Having the user specify more than 11 clone roots is not + * to the send root. Having the user specify more than 16 clone roots is not * common, and in such rare cases we simply don't use caching if the number of - * cloning roots that lead down to a leaf is more than 12. + * cloning roots that lead down to a leaf is more than 17. */ -#define SEND_MAX_BACKREF_CACHE_ROOTS 12 +#define SEND_MAX_BACKREF_CACHE_ROOTS 17 /* * Max number of entries in the cache. - * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, the size in bytes, excluding - * maple tree's internal nodes, is 16K. + * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, the size in bytes, excluding + * maple tree's internal nodes, is 24K. */ #define SEND_MAX_BACKREF_CACHE_SIZE 128 @@ -107,15 +108,31 @@ struct clone_root { * x86_64). */ struct backref_cache_entry { - /* List to link to the cache's lru list. */ - struct list_head list; - /* The key for this entry in the cache. */ - u64 key; + struct btrfs_lru_cache_entry entry; u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS]; /* Number of valid elements in the root_ids array. */ int num_roots; }; +/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */ +static_assert(offsetof(struct backref_cache_entry, entry) == 0); + +/* + * Max number of entries in the cache that stores directories that were already + * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses + * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but + * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64). + */ +#define SEND_MAX_DIR_CREATED_CACHE_SIZE 64 + +/* + * Max number of entries in the cache that stores directories with a pending + * utimes update. The cache uses raw struct btrfs_lru_cache_entry entries, so it + * uses at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, + * but the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64). + */ +#define SEND_MAX_DIR_UTIMES_CACHE_SIZE 64 + struct send_ctx { struct file *send_filp; loff_t send_off; @@ -174,9 +191,7 @@ struct send_ctx { struct list_head new_refs; struct list_head deleted_refs; - struct radix_tree_root name_cache; - struct list_head name_cache_list; - int name_cache_size; + struct btrfs_lru_cache name_cache; /* * The inode we are currently processing. It's not NULL only when we @@ -285,13 +300,11 @@ struct send_ctx { struct rb_root rbtree_new_refs; struct rb_root rbtree_deleted_refs; - struct { - u64 last_reloc_trans; - struct list_head lru_list; - struct maple_tree entries; - /* Number of entries stored in the cache. */ - int size; - } backref_cache; + struct btrfs_lru_cache backref_cache; + u64 backref_cache_last_reloc_trans; + + struct btrfs_lru_cache dir_created_cache; + struct btrfs_lru_cache dir_utimes_cache; }; struct pending_dir_move { @@ -321,21 +334,15 @@ struct orphan_dir_info { u64 ino; u64 gen; u64 last_dir_index_offset; + u64 dir_high_seq_ino; }; struct name_cache_entry { - struct list_head list; /* - * radix_tree has only 32bit entries but we need to handle 64bit inums. - * We use the lower 32bit of the 64bit inum to store it in the tree.
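The sizing argument behind SEND_MAX_BACKREF_CACHE_ROOTS = 17 above can be verified mechanically: the embedded btrfs_lru_cache_entry is 48 bytes on x86_64 (two list_heads plus two u64s), 17 root ids add 136, and the trailing int pads the struct to 192 bytes, exactly the kmalloc-192 slab. A user-space check with stand-in types, valid for LP64 targets such as x86_64:

#include <assert.h>
#include <stdint.h>

struct list_head_sketch { void *next, *prev; };  /* 16 bytes on LP64 */

struct lru_entry_sketch {
        struct list_head_sketch lru_list;
        uint64_t key;
        uint64_t gen;
        struct list_head_sketch list;
};      /* 16 + 8 + 8 + 16 = 48 bytes */

struct backref_cache_entry_sketch {
        struct lru_entry_sketch entry;
        uint64_t root_ids[17];  /* SEND_MAX_BACKREF_CACHE_ROOTS */
        int num_roots;
};      /* 48 + 136 + 4, padded to an 8-byte multiple = 192 */

int main(void)
{
        assert(sizeof(struct lru_entry_sketch) == 48);
        assert(sizeof(struct backref_cache_entry_sketch) == 192);
        return 0;
}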
If - * more then one inum would fall into the same entry, we use radix_list - * to store the additional entries. radix_list is also used to store - * entries where two entries have the same inum but different - * generations. + * The key in the entry is an inode number, and the generation matches + * the inode's generation. */ - struct list_head radix_list; - u64 ino; - u64 gen; + struct btrfs_lru_cache_entry entry; u64 parent_ino; u64 parent_gen; int ret; @@ -344,6 +351,9 @@ struct name_cache_entry { char name[]; }; +/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */ +static_assert(offsetof(struct name_cache_entry, entry) == 0); + #define ADVANCE 1 #define ADVANCE_ONLY_NEXT -1 @@ -956,14 +966,12 @@ out: static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) { int ret; - struct btrfs_inode_info info; + struct btrfs_inode_info info = { 0 }; - if (!gen) - return -EPERM; + ASSERT(gen); ret = get_inode_info(root, ino, &info); - if (!ret) - *gen = info.gen; + *gen = info.gen; return ret; } @@ -1388,19 +1396,6 @@ static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id, return 0; } -static void empty_backref_cache(struct send_ctx *sctx) -{ - struct backref_cache_entry *entry; - struct backref_cache_entry *tmp; - - list_for_each_entry_safe(entry, tmp, &sctx->backref_cache.lru_list, list) - kfree(entry); - - INIT_LIST_HEAD(&sctx->backref_cache.lru_list); - mtree_destroy(&sctx->backref_cache.entries); - sctx->backref_cache.size = 0; -} - static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, const u64 **root_ids_ret, int *root_count_ret) { @@ -1408,9 +1403,10 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, struct send_ctx *sctx = bctx->sctx; struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; const u64 key = leaf_bytenr >> fs_info->sectorsize_bits; + struct btrfs_lru_cache_entry *raw_entry; struct backref_cache_entry *entry; - if (sctx->backref_cache.size == 0) + if (btrfs_lru_cache_size(&sctx->backref_cache) == 0) return false; /* @@ -1424,18 +1420,18 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, * transaction handle or holding fs_info->commit_root_sem, so no need * to take any lock here. */ - if (fs_info->last_reloc_trans > sctx->backref_cache.last_reloc_trans) { - empty_backref_cache(sctx); + if (fs_info->last_reloc_trans > sctx->backref_cache_last_reloc_trans) { + btrfs_lru_cache_clear(&sctx->backref_cache); return false; } - entry = mtree_load(&sctx->backref_cache.entries, key); - if (!entry) + raw_entry = btrfs_lru_cache_lookup(&sctx->backref_cache, key, 0); + if (!raw_entry) return false; + entry = container_of(raw_entry, struct backref_cache_entry, entry); *root_ids_ret = entry->root_ids; *root_count_ret = entry->num_roots; - list_move_tail(&entry->list, &sctx->backref_cache.lru_list); return true; } @@ -1461,7 +1457,8 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, if (!new_entry) return; - new_entry->key = leaf_bytenr >> fs_info->sectorsize_bits; + new_entry->entry.key = leaf_bytenr >> fs_info->sectorsize_bits; + new_entry->entry.gen = 0; new_entry->num_roots = 0; ULIST_ITER_INIT(&uiter); while ((node = ulist_next(root_ids, &uiter)) != NULL) { @@ -1489,23 +1486,12 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, * none of the roots is part of the list of roots from which we are * allowed to clone. Cache the new entry as it's still useful to avoid * backref walking to determine which roots have a path to the leaf. 
+ * + * Also use GFP_NOFS because we're called while holding a transaction + * handle or while holding fs_info->commit_root_sem. */ - - if (sctx->backref_cache.size >= SEND_MAX_BACKREF_CACHE_SIZE) { - struct backref_cache_entry *lru_entry; - struct backref_cache_entry *mt_entry; - - lru_entry = list_first_entry(&sctx->backref_cache.lru_list, - struct backref_cache_entry, list); - mt_entry = mtree_erase(&sctx->backref_cache.entries, lru_entry->key); - ASSERT(mt_entry == lru_entry); - list_del(&mt_entry->list); - kfree(mt_entry); - sctx->backref_cache.size--; - } - - ret = mtree_insert(&sctx->backref_cache.entries, new_entry->key, - new_entry, GFP_NOFS); + ret = btrfs_lru_cache_store(&sctx->backref_cache, &new_entry->entry, + GFP_NOFS); ASSERT(ret == 0 || ret == -ENOMEM); if (ret) { /* Caching is optional, no worries. */ @@ -1513,17 +1499,13 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, return; } - list_add_tail(&new_entry->list, &sctx->backref_cache.lru_list); - /* * We are called from iterate_extent_inodes() while either holding a * transaction handle or holding fs_info->commit_root_sem, so no need * to take any lock here. */ - if (sctx->backref_cache.size == 0) - sctx->backref_cache.last_reloc_trans = fs_info->last_reloc_trans; - - sctx->backref_cache.size++; + if (btrfs_lru_cache_size(&sctx->backref_cache) == 1) + sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans; } static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei, @@ -1886,7 +1868,8 @@ enum inode_state { inode_state_did_delete, }; -static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) +static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, + u64 *send_gen, u64 *parent_gen) { int ret; int left_ret; @@ -1900,6 +1883,8 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) goto out; left_ret = (info.nlink == 0) ? -ENOENT : ret; left_gen = info.gen; + if (send_gen) + *send_gen = ((left_ret == -ENOENT) ? 0 : info.gen); if (!sctx->parent_root) { right_ret = -ENOENT; @@ -1909,6 +1894,8 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) goto out; right_ret = (info.nlink == 0) ? -ENOENT : ret; right_gen = info.gen; + if (parent_gen) + *parent_gen = ((right_ret == -ENOENT) ? 0 : info.gen); } if (!left_ret && !right_ret) { @@ -1953,14 +1940,15 @@ out: return ret; } -static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen) +static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen, + u64 *send_gen, u64 *parent_gen) { int ret; if (ino == BTRFS_FIRST_FREE_OBJECTID) return 1; - ret = get_cur_inode_state(sctx, ino, gen); + ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen); if (ret < 0) goto out; @@ -2121,43 +2109,36 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, const char *name, int name_len, u64 *who_ino, u64 *who_gen, u64 *who_mode) { - int ret = 0; - u64 gen; + int ret; + u64 parent_root_dir_gen; u64 other_inode = 0; struct btrfs_inode_info info; if (!sctx->parent_root) - goto out; + return 0; - ret = is_inode_existent(sctx, dir, dir_gen); + ret = is_inode_existent(sctx, dir, dir_gen, NULL, &parent_root_dir_gen); if (ret <= 0) - goto out; + return 0; /* * If we have a parent root we need to verify that the parent dir was * not deleted and then re-created, if it was then we have no overwrite * and we can just unlink this entry. + * + * @parent_root_dir_gen was set to 0 if the inode does not exist in the + * parent root. 
*/ - if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) { - ret = get_inode_gen(sctx->parent_root, dir, &gen); - if (ret < 0 && ret != -ENOENT) - goto out; - if (ret) { - ret = 0; - goto out; - } - if (gen != dir_gen) - goto out; - } + if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID && + parent_root_dir_gen != dir_gen) + return 0; ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len, &other_inode); - if (ret < 0 && ret != -ENOENT) - goto out; - if (ret) { - ret = 0; - goto out; - } + if (ret == -ENOENT) + return 0; + else if (ret < 0) + return ret; /* * Check if the overwritten ref was already processed. If yes, the ref @@ -2168,18 +2149,15 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, is_waiting_for_move(sctx, other_inode)) { ret = get_inode_info(sctx->parent_root, other_inode, &info); if (ret < 0) - goto out; + return ret; - ret = 1; *who_ino = other_inode; *who_gen = info.gen; *who_mode = info.mode; - } else { - ret = 0; + return 1; } -out: - return ret; + return 0; } /* @@ -2194,47 +2172,43 @@ static int did_overwrite_ref(struct send_ctx *sctx, u64 ino, u64 ino_gen, const char *name, int name_len) { - int ret = 0; - u64 gen; + int ret; u64 ow_inode; + u64 ow_gen = 0; + u64 send_root_dir_gen; if (!sctx->parent_root) - goto out; + return 0; - ret = is_inode_existent(sctx, dir, dir_gen); + ret = is_inode_existent(sctx, dir, dir_gen, &send_root_dir_gen, NULL); if (ret <= 0) - goto out; + return ret; - if (dir != BTRFS_FIRST_FREE_OBJECTID) { - ret = get_inode_gen(sctx->send_root, dir, &gen); - if (ret < 0 && ret != -ENOENT) - goto out; - if (ret) { - ret = 0; - goto out; - } - if (gen != dir_gen) - goto out; - } + /* + * @send_root_dir_gen was set to 0 if the inode does not exist in the + * send root. + */ + if (dir != BTRFS_FIRST_FREE_OBJECTID && send_root_dir_gen != dir_gen) + return 0; /* check if the ref was overwritten by another ref */ ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len, &ow_inode); - if (ret < 0 && ret != -ENOENT) - goto out; - if (ret) { + if (ret == -ENOENT) { /* was never and will never be overwritten */ - ret = 0; - goto out; + return 0; + } else if (ret < 0) { + return ret; } - ret = get_inode_gen(sctx->send_root, ow_inode, &gen); - if (ret < 0) - goto out; + if (ow_inode == ino) { + ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen); + if (ret < 0) + return ret; - if (ow_inode == ino && gen == ino_gen) { - ret = 0; - goto out; + /* It's the same inode, so no overwrite happened. */ + if (ow_gen == ino_gen) + return 0; } /* @@ -2243,15 +2217,20 @@ static int did_overwrite_ref(struct send_ctx *sctx, * inode 'ino' to be orphanized, therefore check if ow_inode matches * the current inode being processed. */ - if ((ow_inode < sctx->send_progress) || - (ino != sctx->cur_ino && ow_inode == sctx->cur_ino && - gen == sctx->cur_inode_gen)) - ret = 1; - else - ret = 0; + if (ow_inode < sctx->send_progress) + return 1; -out: - return ret; + if (ino != sctx->cur_ino && ow_inode == sctx->cur_ino) { + if (ow_gen == 0) { + ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen); + if (ret < 0) + return ret; + } + if (ow_gen == sctx->cur_inode_gen) + return 1; + } + + return 0; } /* @@ -2285,113 +2264,16 @@ out: return ret; } -/* - * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit, - * so we need to do some special handling in case we have clashes. This function - * takes care of this with the help of name_cache_entry::radix_list. - * In case of error, nce is kfreed. 
- */ -static int name_cache_insert(struct send_ctx *sctx, - struct name_cache_entry *nce) +static inline struct name_cache_entry *name_cache_search(struct send_ctx *sctx, + u64 ino, u64 gen) { - int ret = 0; - struct list_head *nce_head; - - nce_head = radix_tree_lookup(&sctx->name_cache, - (unsigned long)nce->ino); - if (!nce_head) { - nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL); - if (!nce_head) { - kfree(nce); - return -ENOMEM; - } - INIT_LIST_HEAD(nce_head); - - ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); - if (ret < 0) { - kfree(nce_head); - kfree(nce); - return ret; - } - } - list_add_tail(&nce->radix_list, nce_head); - list_add_tail(&nce->list, &sctx->name_cache_list); - sctx->name_cache_size++; - - return ret; -} + struct btrfs_lru_cache_entry *entry; -static void name_cache_delete(struct send_ctx *sctx, - struct name_cache_entry *nce) -{ - struct list_head *nce_head; - - nce_head = radix_tree_lookup(&sctx->name_cache, - (unsigned long)nce->ino); - if (!nce_head) { - btrfs_err(sctx->send_root->fs_info, - "name_cache_delete lookup failed ino %llu cache size %d, leaking memory", - nce->ino, sctx->name_cache_size); - } - - list_del(&nce->radix_list); - list_del(&nce->list); - sctx->name_cache_size--; - - /* - * We may not get to the final release of nce_head if the lookup fails - */ - if (nce_head && list_empty(nce_head)) { - radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); - kfree(nce_head); - } -} - -static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, - u64 ino, u64 gen) -{ - struct list_head *nce_head; - struct name_cache_entry *cur; - - nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino); - if (!nce_head) + entry = btrfs_lru_cache_lookup(&sctx->name_cache, ino, gen); + if (!entry) return NULL; - list_for_each_entry(cur, nce_head, radix_list) { - if (cur->ino == ino && cur->gen == gen) - return cur; - } - return NULL; -} - -/* - * Remove some entries from the beginning of name_cache_list. - */ -static void name_cache_clean_unused(struct send_ctx *sctx) -{ - struct name_cache_entry *nce; - - if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE) - return; - - while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) { - nce = list_entry(sctx->name_cache_list.next, - struct name_cache_entry, list); - name_cache_delete(sctx, nce); - kfree(nce); - } -} - -static void name_cache_free(struct send_ctx *sctx) -{ - struct name_cache_entry *nce; - - while (!list_empty(&sctx->name_cache_list)) { - nce = list_entry(sctx->name_cache_list.next, - struct name_cache_entry, list); - name_cache_delete(sctx, nce); - kfree(nce); - } + return container_of(entry, struct name_cache_entry, entry); } /* @@ -2410,7 +2292,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, { int ret; int nce_ret; - struct name_cache_entry *nce = NULL; + struct name_cache_entry *nce; /* * First check if we already did a call to this function with the same @@ -2420,17 +2302,9 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, nce = name_cache_search(sctx, ino, gen); if (nce) { if (ino < sctx->send_progress && nce->need_later_update) { - name_cache_delete(sctx, nce); - kfree(nce); + btrfs_lru_cache_remove(&sctx->name_cache, &nce->entry); nce = NULL; } else { - /* - * Removes the entry from the list and adds it back to - * the end. This marks the entry as recently used so - * that name_cache_clean_unused does not remove it. 
- */ - list_move_tail(&nce->list, &sctx->name_cache_list); - *parent_ino = nce->parent_ino; *parent_gen = nce->parent_gen; ret = fs_path_add(dest, nce->name, nce->name_len); @@ -2446,7 +2320,7 @@ * This should only happen for the parent dir that we determine in * record_new_ref_if_needed(). */ - ret = is_inode_existent(sctx, ino, gen); + ret = is_inode_existent(sctx, ino, gen, NULL, NULL); if (ret < 0) goto out; @@ -2497,8 +2371,8 @@ out_cache: goto out; } - nce->ino = ino; - nce->gen = gen; + nce->entry.key = ino; + nce->entry.gen = gen; nce->parent_ino = *parent_ino; nce->parent_gen = *parent_gen; nce->name_len = fs_path_len(dest); @@ -2510,10 +2384,11 @@ out_cache: else nce->need_later_update = 1; - nce_ret = name_cache_insert(sctx, nce); - if (nce_ret < 0) + nce_ret = btrfs_lru_cache_store(&sctx->name_cache, &nce->entry, GFP_KERNEL); + if (nce_ret < 0) { + kfree(nce); ret = nce_ret; - name_cache_clean_unused(sctx); + } out: return ret; @@ -2884,6 +2759,63 @@ out: } /* + * If the cache is full, we can't remove entries from it and do a call to + * send_utimes() for each respective inode, because we might be finishing + * processing an inode that is a directory and it just got renamed, and existing + * entries in the cache may refer to inodes that have the directory in their + * full path - in which case we would generate outdated paths (pre-rename) + * for the inodes that the cache entries point to. Instead of pruning the + * cache when inserting, do it after we finish processing each inode at + * finish_inode_if_needed(). + */ +static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen) +{ + struct btrfs_lru_cache_entry *entry; + int ret; + + entry = btrfs_lru_cache_lookup(&sctx->dir_utimes_cache, dir, gen); + if (entry != NULL) + return 0; + + /* Caching is optional, don't fail if we can't allocate memory. */ + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return send_utimes(sctx, dir, gen); + + entry->key = dir; + entry->gen = gen; + + ret = btrfs_lru_cache_store(&sctx->dir_utimes_cache, entry, GFP_KERNEL); + ASSERT(ret != -EEXIST); + if (ret) { + kfree(entry); + return send_utimes(sctx, dir, gen); + } + + return 0; +} + +static int trim_dir_utimes_cache(struct send_ctx *sctx) +{ + while (btrfs_lru_cache_size(&sctx->dir_utimes_cache) > + SEND_MAX_DIR_UTIMES_CACHE_SIZE) { + struct btrfs_lru_cache_entry *lru; + int ret; + + lru = btrfs_lru_cache_lru_entry(&sctx->dir_utimes_cache); + ASSERT(lru != NULL); + + ret = send_utimes(sctx, lru->key, lru->gen); + if (ret) + return ret; + + btrfs_lru_cache_remove(&sctx->dir_utimes_cache, lru); + } + + return 0; +} + +/* * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have * a valid path yet because we did not process the refs yet. So, the inode * is created as orphan. @@ -2971,6 +2903,23 @@ out: return ret; } +static void cache_dir_created(struct send_ctx *sctx, u64 dir) +{ + struct btrfs_lru_cache_entry *entry; + int ret; + + /* Caching is optional, ignore any failures. */ + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return; + + entry->key = dir; + entry->gen = 0; + ret = btrfs_lru_cache_store(&sctx->dir_created_cache, entry, GFP_KERNEL); + if (ret < 0) + kfree(entry); +} + /* * We need some special handling for inodes that get processed before the parent * directory got created. See process_recorded_refs for details.
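 * The dir_created_cache added above gives this check a fast path: once a * directory is known to have been created, did_create_dir() returns 1 right * away instead of searching the send root for dir items (entries are keyed * only by inode number, hence the generation of 0).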
@@ -2986,6 +2935,9 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) struct btrfs_key di_key; struct btrfs_dir_item *di; + if (btrfs_lru_cache_lookup(&sctx->dir_created_cache, dir, 0)) + return 1; + path = alloc_path_for_send(); if (!path) return -ENOMEM; @@ -3009,6 +2961,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) if (di_key.type != BTRFS_ROOT_ITEM_KEY && di_key.objectid < sctx->send_progress) { ret = 1; + cache_dir_created(sctx, dir); break; } } @@ -3038,7 +2991,12 @@ static int send_create_inode_if_needed(struct send_ctx *sctx) return 0; } - return send_create_inode(sctx, sctx->cur_ino); + ret = send_create_inode(sctx, sctx->cur_ino); + + if (ret == 0 && S_ISDIR(sctx->cur_inode_mode)) + cache_dir_created(sctx, sctx->cur_ino); + + return ret; } struct recorded_ref { @@ -3166,6 +3124,7 @@ static struct orphan_dir_info *add_orphan_dir_info(struct send_ctx *sctx, odi->ino = dir_ino; odi->gen = dir_gen; odi->last_dir_index_offset = 0; + odi->dir_high_seq_ino = 0; rb_link_node(&odi->node, parent, p); rb_insert_color(&odi->node, &sctx->orphan_dirs); @@ -3215,8 +3174,7 @@ static void free_orphan_dir_info(struct send_ctx *sctx, * We check this by iterating all dir items and checking if the inode behind * the dir item was already processed. */ -static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, - u64 send_progress) +static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen) { int ret = 0; int iter_ret = 0; @@ -3227,6 +3185,8 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, struct btrfs_key loc; struct btrfs_dir_item *di; struct orphan_dir_info *odi = NULL; + u64 dir_high_seq_ino = 0; + u64 last_dir_index_offset = 0; /* * Don't try to rmdir the top/root subvolume dir. @@ -3234,17 +3194,62 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, if (dir == BTRFS_FIRST_FREE_OBJECTID) return 0; + odi = get_orphan_dir_info(sctx, dir, dir_gen); + if (odi && sctx->cur_ino < odi->dir_high_seq_ino) + return 0; + path = alloc_path_for_send(); if (!path) return -ENOMEM; + if (!odi) { + /* + * Find the inode number associated with the last dir index + * entry. This is very likely the inode with the highest number + * of all inodes that have an entry in the directory. We can + * then use it to avoid future calls to can_rmdir(), when + * processing inodes with a lower number, from having to search + * the parent root b+tree for dir index keys. + */ + key.objectid = dir; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + /* Can't happen, the root is never empty. */ + ASSERT(path->slots[0] > 0); + if (WARN_ON(path->slots[0] == 0)) { + ret = -EUCLEAN; + goto out; + } + path->slots[0]--; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != dir || key.type != BTRFS_DIR_INDEX_KEY) { + /* No index keys, dir can be removed. */ + ret = 1; + goto out; + } + + di = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); + dir_high_seq_ino = loc.objectid; + if (sctx->cur_ino < dir_high_seq_ino) { + ret = 0; + goto out; + } + + btrfs_release_path(path); + } + key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; - key.offset = 0; - - odi = get_orphan_dir_info(sctx, dir, dir_gen); - if (odi) - key.offset = odi->last_dir_index_offset; + key.offset = (odi ? 
odi->last_dir_index_offset : 0); btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct waiting_dir_move *dm; @@ -3257,29 +3262,18 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); + dir_high_seq_ino = max(dir_high_seq_ino, loc.objectid); + last_dir_index_offset = found_key.offset; + dm = get_waiting_dir_move(sctx, loc.objectid); if (dm) { - odi = add_orphan_dir_info(sctx, dir, dir_gen); - if (IS_ERR(odi)) { - ret = PTR_ERR(odi); - goto out; - } - odi->gen = dir_gen; - odi->last_dir_index_offset = found_key.offset; dm->rmdir_ino = dir; dm->rmdir_gen = dir_gen; ret = 0; goto out; } - if (loc.objectid > send_progress) { - odi = add_orphan_dir_info(sctx, dir, dir_gen); - if (IS_ERR(odi)) { - ret = PTR_ERR(odi); - goto out; - } - odi->gen = dir_gen; - odi->last_dir_index_offset = found_key.offset; + if (loc.objectid > sctx->cur_ino) { ret = 0; goto out; } @@ -3294,7 +3288,22 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, out: btrfs_free_path(path); - return ret; + + if (ret) + return ret; + + if (!odi) { + odi = add_orphan_dir_info(sctx, dir, dir_gen); + if (IS_ERR(odi)) + return PTR_ERR(odi); + + odi->gen = dir_gen; + } + + odi->last_dir_index_offset = last_dir_index_offset; + odi->dir_high_seq_ino = max(odi->dir_high_seq_ino, dir_high_seq_ino); + + return 0; } static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) @@ -3579,7 +3588,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) } gen = odi->gen; - ret = can_rmdir(sctx, rmdir_ino, gen, sctx->cur_ino); + ret = can_rmdir(sctx, rmdir_ino, gen); if (ret < 0) goto out; if (!ret) @@ -3599,7 +3608,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) } finish: - ret = send_utimes(sctx, pm->ino, pm->gen); + ret = cache_dir_utimes(sctx, pm->ino, pm->gen); if (ret < 0) goto out; @@ -3619,7 +3628,7 @@ finish: if (ret < 0) goto out; - ret = send_utimes(sctx, cur->dir, cur->dir_gen); + ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; } @@ -4242,7 +4251,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * "testdir_2". */ list_for_each_entry(cur, &sctx->new_refs, list) { - ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); if (ret < 0) goto out; if (ret == inode_state_will_create) @@ -4288,12 +4297,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * the source path when performing its rename * operation. */ - if (is_waiting_for_move(sctx, ow_inode)) { - wdm = get_waiting_dir_move(sctx, - ow_inode); - ASSERT(wdm); + wdm = get_waiting_dir_move(sctx, ow_inode); + if (wdm) wdm->orphanized = true; - } /* * Make sure we clear our orphanized inode's @@ -4306,10 +4312,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * and get instead the orphan name. */ nce = name_cache_search(sctx, ow_inode, ow_gen); - if (nce) { - name_cache_delete(sctx, nce); - kfree(nce); - } + if (nce) + btrfs_lru_cache_remove(&sctx->name_cache, + &nce->entry); /* * ow_inode might currently be an ancestor of @@ -4358,7 +4363,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * parent directory out of order. But we need to check if this * did already happen before due to other refs in the same dir. 
*/ - ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); if (ret < 0) goto out; if (ret == inode_state_will_create) { @@ -4388,6 +4393,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) ret = send_create_inode(sctx, cur->dir); if (ret < 0) goto out; + cache_dir_created(sctx, cur->dir); } } @@ -4470,8 +4476,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * later, we do this check again and rmdir it then if possible. * See the use of check_dirs for more details. */ - ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen, - sctx->cur_ino); + ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen); if (ret < 0) goto out; if (ret) { @@ -4564,20 +4569,18 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (cur->dir > sctx->cur_ino) continue; - ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); if (ret < 0) goto out; if (ret == inode_state_did_create || ret == inode_state_no_change) { - /* TODO delayed utimes */ - ret = send_utimes(sctx, cur->dir, cur->dir_gen); + ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; } else if (ret == inode_state_did_delete && cur->dir != last_dir_ino_rm) { - ret = can_rmdir(sctx, cur->dir, cur->dir_gen, - sctx->cur_ino); + ret = can_rmdir(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; if (ret) { @@ -5635,7 +5638,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, * boundary in the send buffer. This means that there may be a gap * between the beginning of the command and the file data. */ - data_offset = ALIGN(sctx->send_size, PAGE_SIZE); + data_offset = PAGE_ALIGN(sctx->send_size); if (data_offset > sctx->send_max_size || sctx->send_max_size - data_offset < disk_num_bytes) { ret = -EOVERFLOW; @@ -5759,7 +5762,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, sent += size; } - if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) { + if (sctx->clean_page_cache && PAGE_ALIGNED(end)) { /* * Always operate only on ranges that are a multiple of the page * size. This is not only to prevent zeroing parts of a page in @@ -6754,12 +6757,26 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) * it's moved/renamed, therefore we don't need to do it here. */ sctx->send_progress = sctx->cur_ino + 1; - ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + + /* + * If the current inode is a non-empty directory, delay issuing + * the utimes command for it, as it's very likely we have inodes + * with a higher number inside it. We want to issue the utimes + * command only after adding all dentries to it.
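+ * For example, inodes with a number higher than the directory's are + * processed later, and each dentry they add updates the directory's + * timestamps, so a utimes command sent now would quickly become stale.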
+ */ + if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_size > 0) + ret = cache_dir_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + else + ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + if (ret < 0) goto out; } out: + if (!ret) + ret = trim_dir_utimes_cache(sctx); + return ret; } @@ -8044,6 +8061,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) int clone_sources_to_rollback = 0; size_t alloc_size; int sort_clone_roots = 0; + struct btrfs_lru_cache_entry *entry; + struct btrfs_lru_cache_entry *tmp; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -8073,10 +8092,10 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) /* * Check that we don't overflow at later allocations, we request * clone_sources_count + 1 items, and compare to unsigned long inside - * access_ok. + * access_ok. Also set an upper limit for allocation size so this can't + * easily exhaust memory. Max number of clone sources is about 200K. */ - if (arg->clone_sources_count > - ULONG_MAX / sizeof(struct clone_root) - 1) { + if (arg->clone_sources_count > SZ_8M / sizeof(struct clone_root)) { ret = -EINVAL; goto out; } @@ -8094,11 +8113,22 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) INIT_LIST_HEAD(&sctx->new_refs); INIT_LIST_HEAD(&sctx->deleted_refs); - INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL); - INIT_LIST_HEAD(&sctx->name_cache_list); - INIT_LIST_HEAD(&sctx->backref_cache.lru_list); - mt_init(&sctx->backref_cache.entries); + btrfs_lru_cache_init(&sctx->name_cache, SEND_MAX_NAME_CACHE_SIZE); + btrfs_lru_cache_init(&sctx->backref_cache, SEND_MAX_BACKREF_CACHE_SIZE); + btrfs_lru_cache_init(&sctx->dir_created_cache, + SEND_MAX_DIR_CREATED_CACHE_SIZE); + /* + * This cache is periodically trimmed to a fixed size elsewhere, see + * cache_dir_utimes() and trim_dir_utimes_cache(). 
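+ * A size limit of 0 means the cache itself never evicts entries, which + * matters here because evicting an entry must be paired with issuing a + * send_utimes() call for it.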
+ */ + btrfs_lru_cache_init(&sctx->dir_utimes_cache, 0); + + sctx->pending_dir_moves = RB_ROOT; + sctx->waiting_dir_moves = RB_ROOT; + sctx->orphan_dirs = RB_ROOT; + sctx->rbtree_new_refs = RB_ROOT; + sctx->rbtree_deleted_refs = RB_ROOT; sctx->flags = arg->flags; @@ -8165,12 +8195,6 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) goto out; } - sctx->pending_dir_moves = RB_ROOT; - sctx->waiting_dir_moves = RB_ROOT; - sctx->orphan_dirs = RB_ROOT; - sctx->rbtree_new_refs = RB_ROOT; - sctx->rbtree_deleted_refs = RB_ROOT; - sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), arg->clone_sources_count + 1, GFP_KERNEL); @@ -8279,6 +8303,13 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) if (ret < 0) goto out; + btrfs_lru_cache_for_each_entry_safe(&sctx->dir_utimes_cache, entry, tmp) { + ret = send_utimes(sctx, entry->key, entry->gen); + if (ret < 0) + goto out; + btrfs_lru_cache_remove(&sctx->dir_utimes_cache, entry); + } + if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) { ret = begin_cmd(sctx, BTRFS_SEND_C_END); if (ret < 0) @@ -8358,11 +8389,12 @@ out: kvfree(sctx->send_buf); kvfree(sctx->verity_descriptor); - name_cache_free(sctx); - close_current_inode(sctx); - empty_backref_cache(sctx); + btrfs_lru_cache_clear(&sctx->name_cache); + btrfs_lru_cache_clear(&sctx->backref_cache); + btrfs_lru_cache_clear(&sctx->dir_created_cache); + btrfs_lru_cache_clear(&sctx->dir_utimes_cache); kfree(sctx); } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 433ce221dc5c..581845bc206a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -58,6 +58,7 @@ #include "scrub.h" #include "verity.h" #include "super.h" +#include "extent-tree.h" #define CREATE_TRACE_POINTS #include <trace/events/btrfs.h> @@ -2049,7 +2050,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) } /* - * Metadata in mixed block goup profiles are accounted in data + * Metadata in mixed block group profiles are accounted in data */ if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 45615ce36498..8c5efa5813b3 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -702,7 +702,7 @@ static void release_raid_kobj(struct kobject *kobj) kfree(to_raid_kobj(kobj)); } -static struct kobj_type btrfs_raid_ktype = { +static const struct kobj_type btrfs_raid_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = release_raid_kobj, .default_groups = raid_groups, @@ -900,7 +900,7 @@ static void space_info_release(struct kobject *kobj) kfree(sinfo); } -static struct kobj_type space_info_ktype = { +static const struct kobj_type space_info_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = space_info_release, .default_groups = space_info_groups, @@ -1259,7 +1259,7 @@ static void btrfs_release_fsid_kobj(struct kobject *kobj) complete(&fs_devs->kobj_unregister); } -static struct kobj_type btrfs_ktype = { +static const struct kobj_type btrfs_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = btrfs_release_fsid_kobj, }; @@ -1789,7 +1789,7 @@ static void btrfs_release_devid_kobj(struct kobject *kobj) complete(&device->kobj_unregister); } -static struct kobj_type devid_ktype = { +static const struct kobj_type devid_ktype = { .sysfs_ops = &kobj_sysfs_ops, .default_groups = devid_groups, .release = btrfs_release_devid_kobj, @@ -2103,7 +2103,7 @@ static void qgroups_release(struct kobject *kobj) kfree(kobj); } -static struct kobj_type qgroups_ktype = { +static const struct 
kobj_type qgroups_ktype = { .sysfs_ops = &kobj_sysfs_ops, .default_groups = qgroups_groups, .release = qgroups_release, @@ -2173,7 +2173,7 @@ static void qgroup_release(struct kobject *kobj) memset(&qgroup->kobj, 0, sizeof(*kobj)); } -static struct kobj_type qgroup_ktype = { +static const struct kobj_type qgroup_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = qgroup_release, .default_groups = qgroup_groups, @@ -2272,36 +2272,23 @@ void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, * Change per-fs features in /sys/fs/btrfs/UUID/features to match current * values in superblock. Call after any changes to incompat/compat_ro flags */ -void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, - u64 bit, enum btrfs_feature_set set) +void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_devices *fs_devs; struct kobject *fsid_kobj; - u64 __maybe_unused features; - int __maybe_unused ret; + int ret; if (!fs_info) return; - /* - * See 14e46e04958df74 and e410e34fad913dd, feature bit updates are not - * safe when called from some contexts (eg. balance) - */ - features = get_features(fs_info, set); - ASSERT(bit & supported_feature_masks[set]); - - fs_devs = fs_info->fs_devices; - fsid_kobj = &fs_devs->fsid_kobj; - + fsid_kobj = &fs_info->fs_devices->fsid_kobj; if (!fsid_kobj->state_initialized) return; - /* - * FIXME: this is too heavy to update just one value, ideally we'd like - * to use sysfs_update_group but some refactoring is needed first. - */ - sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group); - ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group); + ret = sysfs_update_group(fsid_kobj, &btrfs_feature_attr_group); + if (ret < 0) + btrfs_warn(fs_info, + "failed to update /sys/fs/btrfs/%pU/features: %d", + fs_info->fs_devices->fsid, ret); } int __init btrfs_init_sysfs(void) diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index bacef43f7267..86c7eef12873 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -19,8 +19,7 @@ void btrfs_sysfs_remove_device(struct btrfs_device *device); int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices); -void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, - u64 bit, enum btrfs_feature_set set); +void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info); void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action); int __init btrfs_init_sysfs(void); diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 181469fc0bb3..ca09cf9afce8 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -64,7 +64,7 @@ struct inode *btrfs_new_test_inode(void) BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; - inode_init_owner(&init_user_ns, inode, NULL, S_IFREG); + inode_init_owner(&nop_mnt_idmap, inode, NULL, S_IFREG); return inode; } diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index c5b3a631bf4f..f2f2e11dac4c 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -509,7 +509,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, goto out_free; } - ret = btrfs_rmap_block(fs_info, em->start, NULL, btrfs_sb_offset(1), + ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1), &logical, &out_ndaddrs, 
&out_stripe_len); if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) { test_err("didn't rmap anything but expected %d", diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index b8c52e89688c..18329ebcb1cb 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2464,6 +2464,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wake_up(&fs_info->transaction_wait); btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); + /* If we have features changed, wake up the cleaner to update sysfs. */ + if (test_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags) && + fs_info->cleaner_kthread) + wake_up_process(fs_info->cleaner_kthread); + ret = btrfs_write_and_wait_transaction(trans); if (ret) { btrfs_handle_fs_error(fs_info, ret, @@ -2604,6 +2609,35 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) return (ret < 0) ? 0 : 1; } +/* + * We only mark the transaction aborted and then set the file system read-only. + * This will prevent new transactions from starting or trying to join this + * one. + * + * This means that error recovery at the call site is limited to freeing + * any local memory allocations and passing the error code up without + * further cleanup. The transaction should complete as it normally would + * in the call path but will return -EIO. + * + * We'll complete the cleanup in btrfs_end_transaction and + * btrfs_commit_transaction. + */ +void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + const char *function, + unsigned int line, int errno, bool first_hit) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + + WRITE_ONCE(trans->aborted, errno); + WRITE_ONCE(trans->transaction->aborted, errno); + if (first_hit && errno == -ENOSPC) + btrfs_dump_space_info_for_trans_abort(fs_info); + /* Wake up anybody who may be waiting on this transaction */ + wake_up(&fs_info->transaction_wait); + wake_up(&fs_info->transaction_blocked_wait); + __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); +} + int __init btrfs_transaction_init(void) { btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 97f6c39f59c8..fa728ab80826 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -202,6 +202,34 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) delayed_refs->qgroup_to_skip = 0; } +bool __cold abort_should_print_stack(int errno); + +/* + * Call btrfs_abort_transaction as early as possible when an error condition is + * detected, that way the exact stack trace is reported for some errors. + */ +#define btrfs_abort_transaction(trans, errno) \ +do { \ + bool first = false; \ + /* Report first abort since mount */ \ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ + &((trans)->fs_info->fs_state))) { \ + first = true; \ + if (WARN(abort_should_print_stack(errno), \ + KERN_ERR \ + "BTRFS: Transaction aborted (error %d)\n", \ + (errno))) { \ + /* Stack trace printed. 
*/ \ + } else { \ + btrfs_debug((trans)->fs_info, \ + "Transaction aborted (error %d)", \ + (errno)); \ + } \ + } \ + __btrfs_abort_transaction((trans), __func__, \ + __LINE__, (errno), first); \ +} while (0) + int btrfs_end_transaction(struct btrfs_trans_handle *trans); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, unsigned int num_items); @@ -236,6 +264,9 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction); void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); +void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + const char *function, + unsigned int line, int errno, bool first_hit); int __init btrfs_transaction_init(void); void __cold btrfs_transaction_exit(void); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d43261545264..200cea6e49e5 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -279,12 +279,6 @@ void btrfs_end_log_trans(struct btrfs_root *root) } } -static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) -{ - filemap_fdatawait_range(buf->pages[0]->mapping, - buf->start, buf->start + buf->len - 1); -} - /* * the walk control struct is used to pass state down the chain when * processing the log tree. The stage field tells us which part @@ -2623,11 +2617,12 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, return ret; } + btrfs_tree_lock(next); + btrfs_clear_buffer_dirty(trans, next); + wait_on_extent_buffer_writeback(next); + btrfs_tree_unlock(next); + if (trans) { - btrfs_tree_lock(next); - btrfs_clean_tree_block(next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); ret = btrfs_pin_reserved_extent(trans, bytenr, blocksize); if (ret) { @@ -2637,8 +2632,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_redirty_list_add( trans->transaction, next); } else { - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) - clear_extent_buffer_dirty(next); unaccount_log_buffer(fs_info, bytenr); } } @@ -2693,11 +2686,12 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, next = path->nodes[*level]; + btrfs_tree_lock(next); + btrfs_clear_buffer_dirty(trans, next); + wait_on_extent_buffer_writeback(next); + btrfs_tree_unlock(next); + if (trans) { - btrfs_tree_lock(next); - btrfs_clean_tree_block(next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); ret = btrfs_pin_reserved_extent(trans, path->nodes[*level]->start, path->nodes[*level]->len); @@ -2706,9 +2700,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, btrfs_redirty_list_add(trans->transaction, next); } else { - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) - clear_extent_buffer_dirty(next); - unaccount_log_buffer(fs_info, path->nodes[*level]->start); } @@ -2776,19 +2767,18 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, next = path->nodes[orig_level]; + btrfs_tree_lock(next); + btrfs_clear_buffer_dirty(trans, next); + wait_on_extent_buffer_writeback(next); + btrfs_tree_unlock(next); + if (trans) { - btrfs_tree_lock(next); - btrfs_clean_tree_block(next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); ret = btrfs_pin_reserved_extent(trans, next->start, next->len); if (ret) goto out; btrfs_redirty_list_add(trans->transaction, next); } else { - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) - 
clear_extent_buffer_dirty(next); unaccount_log_buffer(fs_info, next->start); } } @@ -3576,17 +3566,19 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, } static int flush_dir_items_batch(struct btrfs_trans_handle *trans, - struct btrfs_root *log, + struct btrfs_inode *inode, struct extent_buffer *src, struct btrfs_path *dst_path, int start_slot, int count) { + struct btrfs_root *log = inode->root->log_root; char *ins_data = NULL; struct btrfs_item_batch batch; struct extent_buffer *dst; unsigned long src_offset; unsigned long dst_offset; + u64 last_index; struct btrfs_key key; u32 item_size; int ret; @@ -3644,6 +3636,18 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1); copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size); btrfs_release_path(dst_path); + + last_index = batch.keys[count - 1].offset; + ASSERT(last_index > inode->last_dir_index_offset); + + /* + * If for some unexpected reason the last item's index is not greater + * than the last index we logged, warn and force a transaction commit. + */ + if (WARN_ON(last_index <= inode->last_dir_index_offset)) + ret = BTRFS_LOG_FORCE_COMMIT; + else + inode->last_dir_index_offset = last_index; out: kfree(ins_data); @@ -3693,7 +3697,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, } di = btrfs_item_ptr(src, i, struct btrfs_dir_item); - ctx->last_dir_item_offset = key.offset; /* * Skip ranges of items that consist only of dir item keys created @@ -3756,7 +3759,7 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, if (batch_size > 0) { int ret; - ret = flush_dir_items_batch(trans, log, src, dst_path, + ret = flush_dir_items_batch(trans, inode, src, dst_path, batch_start, batch_size); if (ret < 0) return ret; @@ -3780,7 +3783,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_key min_key; struct btrfs_root *root = inode->root; struct btrfs_root *log = root->log_root; - int err = 0; int ret; u64 last_old_dentry_offset = min_offset - 1; u64 last_offset = (u64)-1; @@ -3821,8 +3823,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, path->slots[0]); if (tmp.type == BTRFS_DIR_INDEX_KEY) last_old_dentry_offset = tmp.offset; - } else if (ret < 0) { - err = ret; + } else if (ret > 0) { + ret = 0; } goto done; @@ -3845,7 +3847,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, if (tmp.type == BTRFS_DIR_INDEX_KEY) last_old_dentry_offset = tmp.offset; } else if (ret < 0) { - err = ret; goto done; } @@ -3867,12 +3868,15 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, */ search: ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); - if (ret > 0) + if (ret > 0) { ret = btrfs_next_item(root, path); + if (ret > 0) { + /* There are no more keys in the inode's root. */ + ret = 0; + goto done; + } + } if (ret < 0) - err = ret; - /* If ret is 1, there are no more keys in the inode's root. 
*/ - if (ret != 0) goto done; /* @@ -3883,8 +3887,8 @@ search: ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx, &last_old_dentry_offset); if (ret != 0) { - if (ret < 0) - err = ret; + if (ret > 0) + ret = 0; goto done; } path->slots[0] = btrfs_header_nritems(path->nodes[0]); @@ -3895,10 +3899,10 @@ search: */ ret = btrfs_next_leaf(root, path); if (ret) { - if (ret == 1) + if (ret == 1) { last_offset = (u64)-1; - else - err = ret; + ret = 0; + } goto done; } btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); @@ -3929,7 +3933,7 @@ done: btrfs_release_path(path); btrfs_release_path(dst_path); - if (err == 0) { + if (ret == 0) { *last_offset_ret = last_offset; /* * In case the leaf was changed in the current transaction but @@ -3940,15 +3944,13 @@ done: * a range, last_old_dentry_offset is == to last_offset. */ ASSERT(last_old_dentry_offset <= last_offset); - if (last_old_dentry_offset < last_offset) { + if (last_old_dentry_offset < last_offset) ret = insert_dir_log_key(trans, log, path, ino, last_old_dentry_offset + 1, last_offset); - if (ret) - err = ret; - } } - return err; + + return ret; } /* @@ -4044,7 +4046,6 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, min_key = BTRFS_DIR_START_INDEX; max_key = 0; - ctx->last_dir_item_offset = inode->last_dir_index_offset; while (1) { ret = log_dir_items(trans, inode, path, dst_path, @@ -4056,8 +4057,6 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, min_key = max_key + 1; } - inode->last_dir_index_offset = ctx->last_dir_item_offset; - return 0; } @@ -5593,10 +5592,8 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction * commits. */ - if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) { - btrfs_set_log_full_commit(trans); + if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) return BTRFS_LOG_FORCE_COMMIT; - } inode = btrfs_iget(root->fs_info->sb, ino, root); /* @@ -6455,7 +6452,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * result in losing the file after a log replay. */ if (full_dir_logging && inode->last_unlink_trans >= trans->transid) { - btrfs_set_log_full_commit(trans); ret = BTRFS_LOG_FORCE_COMMIT; goto out_unlock; } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 85b43075ac58..bdeb5216718f 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -13,8 +13,13 @@ /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 -/* We can't use the tree log for whatever reason, force a transaction commit */ -#define BTRFS_LOG_FORCE_COMMIT (1) +/* + * We can't use the tree log for whatever reason, force a transaction commit. + * We use a negative value because there are functions through the logging code + * that need to return an error (< 0 value), false (0) or true (1). Any negative + * value will do, as it will cause the log to be marked for a full sync. + */ +#define BTRFS_LOG_FORCE_COMMIT (-(MAX_ERRNO + 1)) struct btrfs_log_ctx { int log_ret; @@ -24,8 +29,6 @@ struct btrfs_log_ctx { bool logging_new_delayed_dentries; /* Indicate if the inode being logged was logged before. */ bool logged_before; - /* Tracks the last logged dir item/index key offset. */ - u64 last_dir_item_offset; struct inode *inode; struct list_head list; /* Only used for fast fsyncs. 
*/ diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index bf9eb693a6a7..c5ff16f9e9fa 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -783,30 +783,25 @@ again: /* * fsverity op that writes a Merkle tree block into the btree. * - * @inode: inode to write a Merkle tree block for - * @buf: Merkle tree data block to write - * @index: index of the block in the Merkle tree - * @log_blocksize: log base 2 of the Merkle tree block size - * - * Note that the block size could be different from the page size, so it is not - * safe to assume that index is a page index. + * @inode: inode to write a Merkle tree block for + * @buf: Merkle tree block to write + * @pos: the position of the block in the Merkle tree (in bytes) + * @size: the Merkle tree block size (in bytes) * * Returns 0 on success or negative error code on failure */ static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf, - u64 index, int log_blocksize) + u64 pos, unsigned int size) { - u64 off = index << log_blocksize; - u64 len = 1ULL << log_blocksize; loff_t merkle_pos = merkle_file_pos(inode); if (merkle_pos < 0) return merkle_pos; - if (merkle_pos > inode->i_sb->s_maxbytes - off - len) + if (merkle_pos > inode->i_sb->s_maxbytes - pos - size) return -EFBIG; return write_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, - off, buf, len); + pos, buf, size); } const struct fsverity_operations btrfs_verityops = { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bcfef75b97da..7823168c08a6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -403,6 +403,7 @@ void btrfs_free_device(struct btrfs_device *device) static void free_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device; + WARN_ON(fs_devices->opened); while (!list_empty(&fs_devices->devices)) { device = list_entry(fs_devices->devices.next, @@ -727,7 +728,7 @@ static struct btrfs_fs_devices *find_fsid_reverted_metadata( /* * Handle the case where the scanned device is part of an fs whose last * metadata UUID change reverted it to the original FSID. At the same - * time * fs_devices was first created by another constitutent device + * time fs_devices was first created by another constituent device * which didn't fully observe the operation. This results in an * btrfs_fs_devices created with metadata/fsid different AND * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the @@ -1181,9 +1182,22 @@ void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) mutex_lock(&uuid_mutex); close_fs_devices(fs_devices); - if (!fs_devices->opened) + if (!fs_devices->opened) { list_splice_init(&fs_devices->seed_list, &list); + /* + * If the struct btrfs_fs_devices is not assembled with any + * other device, it can be re-initialized during the next mount + * without needing the device-scan step. Therefore, it can be + * fully freed.
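+ * + * Seed devices are the exception: they were spliced onto the local list + * above and are dealt with in the loop below.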
+ */ + if (fs_devices->num_devices == 1) { + list_del(&fs_devices->fs_list); + free_fs_devices(fs_devices); + } + } + + list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) { close_fs_devices(fs_devices); list_del(&fs_devices->seed_list); @@ -1600,7 +1614,7 @@ again: if (ret < 0) goto out; - while (1) { + while (search_start < search_end) { l = path->nodes[0]; slot = path->slots[0]; if (slot >= btrfs_header_nritems(l)) { @@ -1623,6 +1637,9 @@ again: if (key.type != BTRFS_DEV_EXTENT_KEY) goto next; + if (key.offset > search_end) + break; + if (key.offset > search_start) { hole_size = key.offset - search_start; dev_extent_hole_check(device, &search_start, &hole_size, @@ -1683,6 +1700,7 @@ next: else ret = 0; + ASSERT(max_hole_start + max_hole_size <= search_end); out: btrfs_free_path(path); *start = max_hole_start; @@ -6266,91 +6284,42 @@ static bool need_full_stripe(enum btrfs_map_op op) return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); } -/* - * Calculate the geometry of a particular (address, len) tuple. This - * information is used to calculate how big a particular bio can get before it - * straddles a stripe. - * - * @fs_info: the filesystem - * @em: mapping containing the logical extent - * @op: type of operation - write or read - * @logical: address that we want to figure out the geometry of - * @io_geom: pointer used to return values - * - * Returns < 0 in case a chunk for the given logical address cannot be found, - * usually shouldn't happen unless @logical is corrupted, 0 otherwise. - */ -int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, - enum btrfs_map_op op, u64 logical, - struct btrfs_io_geometry *io_geom) +static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, + u64 offset, u64 *stripe_nr, u64 *stripe_offset, + u64 *full_stripe_start) { - struct map_lookup *map; - u64 len; - u64 offset; - u64 stripe_offset; - u64 stripe_nr; - u32 stripe_len; - u64 raid56_full_stripe_start = (u64)-1; - int data_stripes; + u32 stripe_len = map->stripe_len; ASSERT(op != BTRFS_MAP_DISCARD); - map = em->map_lookup; - /* Offset of this logical address in the chunk */ - offset = logical - em->start; - /* Len of a stripe in a chunk */ - stripe_len = map->stripe_len; /* - * Stripe_nr is where this block falls in - * stripe_offset is the offset of this block in its stripe. + * Stripe_nr is the stripe where this block falls. stripe_offset is + * the offset of this block in its stripe. */ - stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset); - ASSERT(stripe_offset < U32_MAX); + *stripe_nr = div64_u64_rem(offset, stripe_len, stripe_offset); + ASSERT(*stripe_offset < U32_MAX); - data_stripes = nr_data_stripes(map); + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); - /* Only stripe based profiles needs to check against stripe length. */ - if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) { - u64 max_len = stripe_len - stripe_offset; + *full_stripe_start = + div64_u64(offset, full_stripe_len) * full_stripe_len; /* - * In case of raid56, we need to know the stripe aligned start + * For writes to RAID56, allow writing a full stripe set, but + * no straddling of stripe sets.
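+ * The length returned in that case runs exactly to the end of the + * current stripe set: full_stripe_len minus the offset into it.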
*/ - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - unsigned long full_stripe_len = stripe_len * data_stripes; - raid56_full_stripe_start = offset; - - /* - * Allow a write of a full stripe, but make sure we - * don't allow straddling of stripes - */ - raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, - full_stripe_len); - raid56_full_stripe_start *= full_stripe_len; - - /* - * For writes to RAID[56], allow a full stripeset across - * all disks. For other RAID types and for RAID[56] - * reads, just allow a single stripe (on a single disk). - */ - if (op == BTRFS_MAP_WRITE) { - max_len = stripe_len * data_stripes - - (offset - raid56_full_stripe_start); - } - } - len = min_t(u64, em->len - offset, max_len); - } else { - len = em->len - offset; + if (op == BTRFS_MAP_WRITE) + return full_stripe_len - (offset - *full_stripe_start); } - io_geom->len = len; - io_geom->offset = offset; - io_geom->stripe_len = stripe_len; - io_geom->stripe_nr = stripe_nr; - io_geom->stripe_offset = stripe_offset; - io_geom->raid56_stripe_offset = raid56_full_stripe_start; - - return 0; + /* + * For other RAID types and for RAID56 reads, allow a single stripe (on + * a single disk). + */ + if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) + return stripe_len - *stripe_offset; + return U64_MAX; } static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map, @@ -6369,6 +6338,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, { struct extent_map *em; struct map_lookup *map; + u64 map_offset; u64 stripe_offset; u64 stripe_nr; u64 stripe_len; @@ -6387,7 +6357,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, int patch_the_first_stripe_for_dev_replace = 0; u64 physical_to_patch_in_first_stripe = 0; u64 raid56_full_stripe_start = (u64)-1; - struct btrfs_io_geometry geom; + u64 max_len; ASSERT(bioc_ret); ASSERT(op != BTRFS_MAP_DISCARD); @@ -6395,18 +6365,14 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, em = btrfs_get_chunk_map(fs_info, logical, *length); ASSERT(!IS_ERR(em)); - ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom); - if (ret < 0) - return ret; - map = em->map_lookup; - - *length = geom.len; - stripe_len = geom.stripe_len; - stripe_nr = geom.stripe_nr; - stripe_offset = geom.stripe_offset; - raid56_full_stripe_start = geom.raid56_stripe_offset; data_stripes = nr_data_stripes(map); + stripe_len = map->stripe_len; + + map_offset = logical - em->start; + max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr, + &stripe_offset, &raid56_full_stripe_start); + *length = min_t(u64, em->len - map_offset, max_len); down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6b7a05f6cf82..7e51f2238f72 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -53,21 +53,6 @@ enum btrfs_raid_types { BTRFS_NR_RAID_TYPES }; -struct btrfs_io_geometry { - /* remaining bytes before crossing a stripe */ - u64 len; - /* offset of logical address in chunk */ - u64 offset; - /* length of single IO stripe */ - u32 stripe_len; - /* offset of address in stripe */ - u32 stripe_offset; - /* number of stripe where address falls */ - u64 stripe_nr; - /* offset of raid56 stripe into the chunk */ - u64 raid56_stripe_offset; -}; - /* * Use sequence counter to get consistent device stat data on * 32-bit processors. 
@@ -545,9 +530,6 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, u64 logical, u64 *length_ret, u32 *num_stripes); -int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map, - enum btrfs_map_op op, u64 logical, - struct btrfs_io_geometry *io_geom); int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 0ed4b119a7ca..0ebeaf4e81f9 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -370,7 +370,7 @@ static int btrfs_xattr_handler_get(const struct xattr_handler *handler, } static int btrfs_xattr_handler_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) @@ -383,7 +383,7 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler, } static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 01a13de11832..da7bb9187b68 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -63,7 +63,7 @@ struct list_head *zlib_alloc_workspace(unsigned int level) workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), zlib_inflate_workspacesize()); - workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL); + workspace->strm.workspace = kvzalloc(workspacesize, GFP_KERNEL); workspace->level = level; workspace->buf = NULL; /* diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 1f503e8e42d4..f95b2c94d619 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -17,6 +17,7 @@ #include "space-info.h" #include "fs.h" #include "accessors.h" +#include "bio.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 @@ -160,7 +161,7 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, */ static inline u32 sb_zone_number(int shift, int mirror) { - u64 zone; + u64 zone = U64_MAX; ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); switch (mirror) { @@ -220,7 +221,6 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, struct blk_zone *zones, unsigned int *nr_zones) { struct btrfs_zoned_device_info *zinfo = device->zone_info; - u32 zno; int ret; if (!*nr_zones) @@ -235,6 +235,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, /* Check cache */ if (zinfo->zone_cache) { unsigned int i; + u32 zno; ASSERT(IS_ALIGNED(pos, zinfo->zone_size)); zno = pos >> zinfo->zone_size_shift; @@ -274,9 +275,12 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, return -EIO; /* Populate cache */ - if (zinfo->zone_cache) + if (zinfo->zone_cache) { + u32 zno = pos >> zinfo->zone_size_shift; + memcpy(zinfo->zone_cache + zno, zones, sizeof(*zinfo->zone_cache) * *nr_zones); + } return 0; } @@ -417,25 +421,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) nr_sectors = bdev_nr_sectors(bdev); zone_info->zone_size_shift = ilog2(zone_info->zone_size); zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors); - /* - * We limit max_zone_append_size 
also by max_segments * - * PAGE_SIZE. Technically, we can have multiple pages per segment. But, - * since btrfs adds the pages one by one to a bio, and btrfs cannot - * increase the metadata reservation even if it increases the number of - * extents, it is safe to stick with the limit. - * - * With the zoned emulation, we can have non-zoned device on the zoned - * mode. In this case, we don't have a valid max zone append size. So, - * use max_segments * PAGE_SIZE as the pseudo max_zone_append_size. - */ - if (bdev_is_zoned(bdev)) { - zone_info->max_zone_append_size = min_t(u64, - (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT, - (u64)bdev_max_segments(bdev) << PAGE_SHIFT); - } else { - zone_info->max_zone_append_size = - (u64)bdev_max_segments(bdev) << PAGE_SHIFT; - } if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; @@ -715,9 +700,9 @@ static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info) int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) { + struct queue_limits *lim = &fs_info->limits; struct btrfs_device *device; u64 zone_size = 0; - u64 max_zone_append_size = 0; int ret; /* @@ -727,6 +712,8 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) if (!btrfs_fs_incompat(fs_info, ZONED)) return btrfs_check_for_zoned_device(fs_info); + blk_set_stacking_limits(lim); + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { struct btrfs_zoned_device_info *zone_info = device->zone_info; @@ -741,10 +728,17 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) zone_info->zone_size, zone_size); return -EINVAL; } - if (!max_zone_append_size || - (zone_info->max_zone_append_size && - zone_info->max_zone_append_size < max_zone_append_size)) - max_zone_append_size = zone_info->max_zone_append_size; + + /* + * With zoned emulation, we can have a non-zoned device in + * zoned mode. In this case, we don't have a valid max zone + * append size. + */ + if (bdev_is_zoned(device->bdev)) { + blk_stack_limits(lim, + &bdev_get_queue(device->bdev)->limits, + 0); + } } /* @@ -765,8 +759,18 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) } fs_info->zone_size = zone_size; - fs_info->max_zone_append_size = ALIGN_DOWN(max_zone_append_size, - fs_info->sectorsize); + /* + * Also limit max_zone_append_size by max_segments * PAGE_SIZE. + * Technically, we can have multiple pages per segment. But, since + * we add the pages one by one to a bio, and cannot increase the + * metadata reservation even if it increases the number of extents, it + * is safe to stick with the limit.
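+ * + * For example, with 4K pages and a stacked limit of 128 segments, the + * cap from this term is 512K, before the final alignment down to the + * sector size.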
+ */ + fs_info->max_zone_append_size = ALIGN_DOWN( + min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT, + (u64)lim->max_sectors << SECTOR_SHIFT, + (u64)lim->max_segments << PAGE_SHIFT), + fs_info->sectorsize); fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; if (fs_info->max_zone_append_size < fs_info->max_extent_size) fs_info->max_extent_size = fs_info->max_zone_append_size; @@ -1623,8 +1627,10 @@ void btrfs_free_redirty_list(struct btrfs_transaction *trans) spin_unlock(&trans->releasing_ebs_lock); } -bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) +bool btrfs_use_zone_append(struct btrfs_bio *bbio) { + u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT); + struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_block_group *cache; bool ret = false; @@ -1635,6 +1641,9 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) if (!is_data_inode(&inode->vfs_inode)) return false; + if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) + return false; + /* * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the * extent layout the relocation code has. @@ -1657,22 +1666,16 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) return ret; } -void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, - struct bio *bio) +void btrfs_record_physical_zoned(struct btrfs_bio *bbio) { + const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; struct btrfs_ordered_extent *ordered; - const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; - if (bio_op(bio) != REQ_OP_ZONE_APPEND) - return; - - ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset); + ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset); if (WARN_ON(!ordered)) return; ordered->physical = physical; - ordered->bdev = bio->bi_bdev; - btrfs_put_ordered_extent(ordered); } @@ -1684,43 +1687,46 @@ void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered) struct extent_map *em; struct btrfs_ordered_sum *sum; u64 orig_logical = ordered->disk_bytenr; - u64 *logical = NULL; - int nr, stripe_len; + struct map_lookup *map; + u64 physical = ordered->physical; + u64 chunk_start_phys; + u64 logical; - /* Zoned devices should not have partitions. 
So, we can assume it is 0 */ - ASSERT(!bdev_is_partition(ordered->bdev)); - if (WARN_ON(!ordered->bdev)) + em = btrfs_get_chunk_map(fs_info, orig_logical, 1); + if (IS_ERR(em)) return; + map = em->map_lookup; + chunk_start_phys = map->stripes[0].physical; - if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, ordered->bdev, - ordered->physical, &logical, &nr, - &stripe_len))) - goto out; - - WARN_ON(nr != 1); + if (WARN_ON_ONCE(map->num_stripes > 1) || + WARN_ON_ONCE((map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) || + WARN_ON_ONCE(physical < chunk_start_phys) || + WARN_ON_ONCE(physical > chunk_start_phys + em->orig_block_len)) { + free_extent_map(em); + return; + } + logical = em->start + (physical - map->stripes[0].physical); + free_extent_map(em); - if (orig_logical == *logical) - goto out; + if (orig_logical == logical) + return; - ordered->disk_bytenr = *logical; + ordered->disk_bytenr = logical; em_tree = &inode->extent_tree; write_lock(&em_tree->lock); em = search_extent_mapping(em_tree, ordered->file_offset, ordered->num_bytes); - em->block_start = *logical; + em->block_start = logical; free_extent_map(em); write_unlock(&em_tree->lock); list_for_each_entry(sum, &ordered->list, list) { - if (*logical < orig_logical) - sum->bytenr -= orig_logical - *logical; + if (logical < orig_logical) + sum->bytenr -= orig_logical - logical; else - sum->bytenr += *logical - orig_logical; + sum->bytenr += logical - orig_logical; } - -out: - kfree(logical); } bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, @@ -1845,26 +1851,6 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length); } -struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, - u64 logical, u64 length) -{ - struct btrfs_device *device; - struct extent_map *em; - struct map_lookup *map; - - em = btrfs_get_chunk_map(fs_info, logical, length); - if (IS_ERR(em)) - return ERR_CAST(em); - - map = em->map_lookup; - /* We only support single profile for now */ - device = map->stripes[0].dev; - - free_extent_map(em); - - return device; -} - /* * Activate block group and underlying device zones * diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index f43990985d80..c0570d35fea2 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -20,7 +20,6 @@ struct btrfs_zoned_device_info { */ u64 zone_size; u8 zone_size_shift; - u64 max_zone_append_size; u32 nr_zones; unsigned int max_active_zones; atomic_t active_zones_left; @@ -56,9 +55,8 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb); void btrfs_free_redirty_list(struct btrfs_transaction *trans); -bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start); -void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, - struct bio *bio); +bool btrfs_use_zone_append(struct btrfs_bio *bbio); +void btrfs_record_physical_zoned(struct btrfs_bio *bbio); void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered); bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, @@ -68,8 +66,6 @@ void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length); int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, u64 physical_start, u64 physical_pos); -struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info 
*fs_info, - u64 logical, u64 length); bool btrfs_zone_activate(struct btrfs_block_group *block_group); int btrfs_zone_finish(struct btrfs_block_group *block_group); bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); @@ -185,13 +181,12 @@ static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb) { } static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { } -static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) +static inline bool btrfs_use_zone_append(struct btrfs_bio *bbio) { return false; } -static inline void btrfs_record_physical_zoned(struct inode *inode, - u64 file_offset, struct bio *bio) +static inline void btrfs_record_physical_zoned(struct btrfs_bio *bbio) { } @@ -224,13 +219,6 @@ static inline int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, return -EOPNOTSUPP; } -static inline struct btrfs_device *btrfs_zoned_get_device( - struct btrfs_fs_info *fs_info, - u64 logical, u64 length) -{ - return ERR_PTR(-EOPNOTSUPP); -} - static inline bool btrfs_zone_activate(struct btrfs_block_group *block_group) { return true; diff --git a/fs/buffer.c b/fs/buffer.c index d9c6d1fbb6dd..9e1e2add541e 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -48,6 +48,7 @@ #include <linux/sched/mm.h> #include <trace/events/block.h> #include <linux/fscrypt.h> +#include <linux/fsverity.h> #include "internal.h" @@ -60,7 +61,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, inline void touch_buffer(struct buffer_head *bh) { trace_block_touch_buffer(bh); - mark_page_accessed(bh->b_page); + folio_mark_accessed(bh->b_folio); } EXPORT_SYMBOL(touch_buffer); @@ -246,18 +247,18 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) unsigned long flags; struct buffer_head *first; struct buffer_head *tmp; - struct page *page; - int page_uptodate = 1; + struct folio *folio; + int folio_uptodate = 1; BUG_ON(!buffer_async_read(bh)); - page = bh->b_page; + folio = bh->b_folio; if (uptodate) { set_buffer_uptodate(bh); } else { clear_buffer_uptodate(bh); buffer_io_error(bh, ", async page read"); - SetPageError(page); + folio_set_error(folio); } /* @@ -265,14 +266,14 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) * two buffer heads end IO at almost the same time and both * decide that the page is now completely done. */ - first = page_buffers(page); + first = folio_buffers(folio); spin_lock_irqsave(&first->b_uptodate_lock, flags); clear_buffer_async_read(bh); unlock_buffer(bh); tmp = bh; do { if (!buffer_uptodate(tmp)) - page_uptodate = 0; + folio_uptodate = 0; if (buffer_async_read(tmp)) { BUG_ON(!buffer_locked(tmp)); goto still_busy; @@ -285,9 +286,9 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) * If all of the buffers are uptodate then we can set the page * uptodate. 
*/ - if (page_uptodate) - SetPageUptodate(page); - unlock_page(page); + if (folio_uptodate) + folio_mark_uptodate(folio); + folio_unlock(folio); return; still_busy: @@ -295,20 +296,53 @@ still_busy: return; } -struct decrypt_bh_ctx { +struct postprocess_bh_ctx { struct work_struct work; struct buffer_head *bh; }; +static void verify_bh(struct work_struct *work) +{ + struct postprocess_bh_ctx *ctx = + container_of(work, struct postprocess_bh_ctx, work); + struct buffer_head *bh = ctx->bh; + bool valid; + + valid = fsverity_verify_blocks(page_folio(bh->b_page), bh->b_size, + bh_offset(bh)); + end_buffer_async_read(bh, valid); + kfree(ctx); +} + +static bool need_fsverity(struct buffer_head *bh) +{ + struct page *page = bh->b_page; + struct inode *inode = page->mapping->host; + + return fsverity_active(inode) && + /* needed by ext4 */ + page->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); +} + static void decrypt_bh(struct work_struct *work) { - struct decrypt_bh_ctx *ctx = - container_of(work, struct decrypt_bh_ctx, work); + struct postprocess_bh_ctx *ctx = + container_of(work, struct postprocess_bh_ctx, work); struct buffer_head *bh = ctx->bh; int err; - err = fscrypt_decrypt_pagecache_blocks(bh->b_page, bh->b_size, - bh_offset(bh)); + err = fscrypt_decrypt_pagecache_blocks(page_folio(bh->b_page), + bh->b_size, bh_offset(bh)); + if (err == 0 && need_fsverity(bh)) { + /* + * We use different work queues for decryption and for verity + * because verity may require reading metadata pages that need + * decryption, and we shouldn't recurse to the same workqueue. + */ + INIT_WORK(&ctx->work, verify_bh); + fsverity_enqueue_verify_work(&ctx->work); + return; + } end_buffer_async_read(bh, err == 0); kfree(ctx); } @@ -319,15 +353,24 @@ static void decrypt_bh(struct work_struct *work) */ static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate) { - /* Decrypt if needed */ - if (uptodate && - fscrypt_inode_uses_fs_layer_crypto(bh->b_page->mapping->host)) { - struct decrypt_bh_ctx *ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC); + struct inode *inode = bh->b_folio->mapping->host; + bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode); + bool verify = need_fsverity(bh); + + /* Decrypt (with fscrypt) and/or verify (with fsverity) if needed. 
*/ + if (uptodate && (decrypt || verify)) { + struct postprocess_bh_ctx *ctx = + kmalloc(sizeof(*ctx), GFP_ATOMIC); if (ctx) { - INIT_WORK(&ctx->work, decrypt_bh); ctx->bh = bh; - fscrypt_enqueue_decrypt_work(&ctx->work); + if (decrypt) { + INIT_WORK(&ctx->work, decrypt_bh); + fscrypt_enqueue_decrypt_work(&ctx->work); + } else { + INIT_WORK(&ctx->work, verify_bh); + fsverity_enqueue_verify_work(&ctx->work); + } return; } uptodate = 0; @@ -344,21 +387,21 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) unsigned long flags; struct buffer_head *first; struct buffer_head *tmp; - struct page *page; + struct folio *folio; BUG_ON(!buffer_async_write(bh)); - page = bh->b_page; + folio = bh->b_folio; if (uptodate) { set_buffer_uptodate(bh); } else { buffer_io_error(bh, ", lost async page write"); mark_buffer_write_io_error(bh); clear_buffer_uptodate(bh); - SetPageError(page); + folio_set_error(folio); } - first = page_buffers(page); + first = folio_buffers(folio); spin_lock_irqsave(&first->b_uptodate_lock, flags); clear_buffer_async_write(bh); @@ -372,7 +415,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) tmp = tmp->b_this_page; } spin_unlock_irqrestore(&first->b_uptodate_lock, flags); - end_page_writeback(page); + folio_end_writeback(folio); return; still_busy: @@ -570,7 +613,7 @@ void write_boundary_block(struct block_device *bdev, void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) { struct address_space *mapping = inode->i_mapping; - struct address_space *buffer_mapping = bh->b_page->mapping; + struct address_space *buffer_mapping = bh->b_folio->mapping; mark_buffer_dirty(bh); if (!mapping->private_data) { @@ -1073,7 +1116,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, * and then attach the address_space's inode to its superblock's dirty * inode list. * - * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, + * mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->private_lock, * i_pages lock and mapping->host->i_lock. */ void mark_buffer_dirty(struct buffer_head *bh) @@ -1095,16 +1138,16 @@ void mark_buffer_dirty(struct buffer_head *bh) } if (!test_set_buffer_dirty(bh)) { - struct page *page = bh->b_page; + struct folio *folio = bh->b_folio; struct address_space *mapping = NULL; - lock_page_memcg(page); - if (!TestSetPageDirty(page)) { - mapping = page_mapping(page); + folio_memcg_lock(folio); + if (!folio_test_set_dirty(folio)) { + mapping = folio->mapping; if (mapping) - __set_page_dirty(page, mapping, 0); + __folio_mark_dirty(folio, mapping, 0); } - unlock_page_memcg(page); + folio_memcg_unlock(folio); if (mapping) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } @@ -1117,8 +1160,8 @@ void mark_buffer_write_io_error(struct buffer_head *bh) set_buffer_write_io_error(bh); /* FIXME: do we need to set this in both places? 
*/ - if (bh->b_page && bh->b_page->mapping) - mapping_set_error(bh->b_page->mapping, -EIO); + if (bh->b_folio && bh->b_folio->mapping) + mapping_set_error(bh->b_folio->mapping, -EIO); if (bh->b_assoc_map) mapping_set_error(bh->b_assoc_map, -EIO); rcu_read_lock(); @@ -1154,7 +1197,7 @@ void __bforget(struct buffer_head *bh) { clear_buffer_dirty(bh); if (bh->b_assoc_map) { - struct address_space *buffer_mapping = bh->b_page->mapping; + struct address_space *buffer_mapping = bh->b_folio->mapping; spin_lock(&buffer_mapping->private_lock); list_del_init(&bh->b_assoc_buffers); @@ -2245,6 +2288,11 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) int nr, i; int fully_mapped = 1; bool page_error = false; + loff_t limit = i_size_read(inode); + + /* This is needed for ext4. */ + if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode)) + limit = inode->i_sb->s_maxbytes; VM_BUG_ON_FOLIO(folio_test_large(folio), folio); @@ -2253,7 +2301,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) bbits = block_size_bits(blocksize); iblock = (sector_t)folio->index << (PAGE_SHIFT - bbits); - lblock = (i_size_read(inode)+blocksize-1) >> bbits; + lblock = (limit+blocksize-1) >> bbits; bh = head; nr = 0; i = 0; diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index a69073a1d3f0..40052bdb3365 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -138,7 +138,7 @@ static int cachefiles_adjust_size(struct cachefiles_object *object) newattrs.ia_size = oi_size & PAGE_MASK; ret = cachefiles_inject_remove_error(); if (ret == 0) - ret = notify_change(&init_user_ns, file->f_path.dentry, + ret = notify_change(&nop_mnt_idmap, file->f_path.dentry, &newattrs, NULL); if (ret < 0) goto truncate_failed; @@ -148,7 +148,7 @@ static int cachefiles_adjust_size(struct cachefiles_object *object) newattrs.ia_size = ni_size; ret = cachefiles_inject_write_error(); if (ret == 0) - ret = notify_change(&init_user_ns, file->f_path.dentry, + ret = notify_change(&nop_mnt_idmap, file->f_path.dentry, &newattrs, NULL); truncate_failed: diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 03ca8f2f657a..82219a8f6084 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -130,7 +130,7 @@ retry: goto mkdir_error; ret = cachefiles_inject_write_error(); if (ret == 0) - ret = vfs_mkdir(&init_user_ns, d_inode(dir), subdir, 0700); + ret = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700); if (ret < 0) { trace_cachefiles_vfs_error(NULL, d_inode(dir), ret, cachefiles_trace_mkdir_error); @@ -245,7 +245,7 @@ static int cachefiles_unlink(struct cachefiles_cache *cache, ret = cachefiles_inject_remove_error(); if (ret == 0) { - ret = vfs_unlink(&init_user_ns, d_backing_inode(dir), dentry, NULL); + ret = vfs_unlink(&nop_mnt_idmap, d_backing_inode(dir), dentry, NULL); if (ret == -EIO) cachefiles_io_error(cache, "Unlink failed"); } @@ -382,10 +382,10 @@ try_again: cachefiles_io_error(cache, "Rename security error %d", ret); } else { struct renamedata rd = { - .old_mnt_userns = &init_user_ns, + .old_mnt_idmap = &nop_mnt_idmap, .old_dir = d_inode(dir), .old_dentry = rep, - .new_mnt_userns = &init_user_ns, + .new_mnt_idmap = &nop_mnt_idmap, .new_dir = d_inode(cache->graveyard), .new_dentry = grave, }; @@ -451,7 +451,7 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) ret = cachefiles_inject_write_error(); if (ret == 0) { - file = vfs_tmpfile_open(&init_user_ns, &parentpath, S_IFREG, + file = vfs_tmpfile_open(&nop_mnt_idmap, &parentpath, 
S_IFREG, O_RDWR | O_LARGEFILE | O_DIRECT, cache->cache_cred); ret = PTR_ERR_OR_ZERO(file); @@ -714,7 +714,7 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache, ret = cachefiles_inject_read_error(); if (ret == 0) - ret = vfs_link(object->file->f_path.dentry, &init_user_ns, + ret = vfs_link(object->file->f_path.dentry, &nop_mnt_idmap, d_inode(fan), dentry, NULL); if (ret < 0) { trace_cachefiles_vfs_error(object, d_inode(fan), ret, diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 00b087c14995..bcb6173943ee 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -65,7 +65,7 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object) ret = cachefiles_inject_write_error(); if (ret == 0) - ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, + ret = vfs_setxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, buf, sizeof(struct cachefiles_xattr) + len, 0); if (ret < 0) { trace_cachefiles_vfs_error(object, file_inode(file), ret, @@ -108,7 +108,7 @@ int cachefiles_check_auxdata(struct cachefiles_object *object, struct file *file xlen = cachefiles_inject_read_error(); if (xlen == 0) - xlen = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, buf, tlen); + xlen = vfs_getxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, buf, tlen); if (xlen != tlen) { if (xlen < 0) trace_cachefiles_vfs_error(object, file_inode(file), xlen, @@ -150,7 +150,7 @@ int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, ret = cachefiles_inject_remove_error(); if (ret == 0) - ret = vfs_removexattr(&init_user_ns, dentry, cachefiles_xattr_cache); + ret = vfs_removexattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache); if (ret < 0) { trace_cachefiles_vfs_error(object, d_inode(dentry), ret, cachefiles_trace_remxattr_error); @@ -207,7 +207,7 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume) ret = cachefiles_inject_write_error(); if (ret == 0) - ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, + ret = vfs_setxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, buf, len, 0); if (ret < 0) { trace_cachefiles_vfs_error(NULL, d_inode(dentry), ret, @@ -249,7 +249,7 @@ int cachefiles_check_volume_xattr(struct cachefiles_volume *volume) xlen = cachefiles_inject_read_error(); if (xlen == 0) - xlen = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, buf, len); + xlen = vfs_getxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, buf, len); if (xlen != len) { if (xlen < 0) { trace_cachefiles_vfs_error(NULL, d_inode(dentry), xlen, diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index c7e8dd5b58d4..6945a938d396 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -85,7 +85,7 @@ retry: return acl; } -int ceph_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int ret = 0, size = 0; @@ -105,7 +105,7 @@ int ceph_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { - ret = posix_acl_update_mode(&init_user_ns, inode, + ret = posix_acl_update_mode(&nop_mnt_idmap, inode, &new_mode, &acl); if (ret) goto out; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 8c74871e37c9..d5335f445233 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -305,7 +305,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) struct inode *inode = rreq->inode; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = 
ceph_inode_to_client(inode); - struct ceph_osd_request *req; + struct ceph_osd_request *req = NULL; struct ceph_vino vino = ceph_vino(inode); struct iov_iter iter; struct page **pages; @@ -313,6 +313,11 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) int err = 0; u64 len = subreq->len; + if (ceph_inode_is_shutdown(inode)) { + err = -EIO; + goto out; + } + if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq)) return; @@ -563,6 +568,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) dout("writepage %p idx %lu\n", page, page->index); + if (ceph_inode_is_shutdown(inode)) + return -EIO; + /* verify this is a writeable snap context */ snapc = page_snap_context(page); if (!snapc) { @@ -792,7 +800,7 @@ static int ceph_writepages_start(struct address_space *mapping, struct ceph_vino vino = ceph_vino(inode); pgoff_t index, start_index, end = -1; struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; - struct pagevec pvec; + struct folio_batch fbatch; int rc = 0; unsigned int wsize = i_blocksize(inode); struct ceph_osd_request *req = NULL; @@ -821,7 +829,7 @@ static int ceph_writepages_start(struct address_space *mapping, if (fsc->mount_options->wsize < wsize) wsize = fsc->mount_options->wsize; - pagevec_init(&pvec); + folio_batch_init(&fbatch); start_index = wbc->range_cyclic ? mapping->writeback_index : 0; index = start_index; @@ -869,7 +877,7 @@ retry: while (!done && index <= end) { int num_ops = 0, op_idx; - unsigned i, pvec_pages, max_pages, locked_pages = 0; + unsigned i, nr_folios, max_pages, locked_pages = 0; struct page **pages = NULL, **data_pages; struct page *page; pgoff_t strip_unit_end = 0; @@ -879,13 +887,13 @@ retry: max_pages = wsize >> PAGE_SHIFT; get_more_pages: - pvec_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, - end, PAGECACHE_TAG_DIRTY); - dout("pagevec_lookup_range_tag got %d\n", pvec_pages); - if (!pvec_pages && !locked_pages) + nr_folios = filemap_get_folios_tag(mapping, &index, + end, PAGECACHE_TAG_DIRTY, &fbatch); + dout("pagevec_lookup_range_tag got %d\n", nr_folios); + if (!nr_folios && !locked_pages) break; - for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) { - page = pvec.pages[i]; + for (i = 0; i < nr_folios && locked_pages < max_pages; i++) { + page = &fbatch.folios[i]->page; dout("? %p idx %lu\n", page, page->index); if (locked_pages == 0) lock_page(page); /* first page */ @@ -995,7 +1003,7 @@ get_more_pages: len = 0; } - /* note position of first page in pvec */ + /* note position of first page in fbatch */ dout("%p will write page %p idx %lu\n", inode, page, page->index); @@ -1005,30 +1013,30 @@ get_more_pages: fsc->write_congested = true; pages[locked_pages++] = page; - pvec.pages[i] = NULL; + fbatch.folios[i] = NULL; len += thp_size(page); } /* did we get anything? 
*/ if (!locked_pages) - goto release_pvec_pages; + goto release_folios; if (i) { unsigned j, n = 0; - /* shift unused page to beginning of pvec */ - for (j = 0; j < pvec_pages; j++) { - if (!pvec.pages[j]) + /* shift unused pages to the beginning of fbatch */ + for (j = 0; j < nr_folios; j++) { + if (!fbatch.folios[j]) continue; if (n < j) - pvec.pages[n] = pvec.pages[j]; + fbatch.folios[n] = fbatch.folios[j]; n++; } - pvec.nr = n; + fbatch.nr = n; - if (pvec_pages && i == pvec_pages && + if (nr_folios && i == nr_folios && locked_pages < max_pages) { - dout("reached end pvec, trying for more\n"); - pagevec_release(&pvec); + dout("reached end of fbatch, trying for more\n"); + folio_batch_release(&fbatch); goto get_more_pages; } } @@ -1164,10 +1172,10 @@ new_request: if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) done = true; -release_pvec_pages: - dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr, - pvec.nr ? pvec.pages[0] : NULL); - pagevec_release(&pvec); +release_folios: + dout("folio_batch release on %d folios (%p)\n", (int)fbatch.nr, + fbatch.nr ? fbatch.folios[0] : NULL); + folio_batch_release(&fbatch); } if (should_loop && !done) { @@ -1184,15 +1192,17 @@ release_pvec_pages: unsigned i, nr; index = 0; while ((index <= end) && - (nr = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_WRITEBACK))) { + (nr = filemap_get_folios_tag(mapping, &index, + (pgoff_t)-1, + PAGECACHE_TAG_WRITEBACK, + &fbatch))) { for (i = 0; i < nr; i++) { - page = pvec.pages[i]; + page = &fbatch.folios[i]->page; if (page_snap_context(page) != snapc) continue; wait_on_page_writeback(page); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } } @@ -1643,7 +1653,7 @@ int ceph_uninline_data(struct file *file) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_osd_request *req = NULL; - struct ceph_cap_flush *prealloc_cf; + struct ceph_cap_flush *prealloc_cf = NULL; struct folio *folio = NULL; u64 inline_version = CEPH_INLINE_NONE; struct page *pages[1]; @@ -1657,6 +1667,11 @@ int ceph_uninline_data(struct file *file) dout("uninline_data %p %llx.%llx inline_version %llu\n", inode, ceph_vinop(inode), inline_version); + if (ceph_inode_is_shutdown(inode)) { + err = -EIO; + goto out; + } + if (inline_version == CEPH_INLINE_NONE) return 0; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index f75ad432f375..7cc20772eac9 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -9,6 +9,7 @@ #include <linux/wait.h> #include <linux/writeback.h> #include <linux/iversion.h> +#include <linux/filelock.h> #include "super.h" #include "mds_client.h" @@ -4078,6 +4079,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, void *p, *end; struct cap_extra_info extra_info = {}; bool queue_trunc; + bool close_sessions = false; dout("handle_caps from mds%d\n", session->s_mds); @@ -4215,9 +4217,13 @@ void ceph_handle_caps(struct ceph_mds_session *session, realm = NULL; if (snaptrace_len) { down_write(&mdsc->snap_rwsem); - ceph_update_snap_trace(mdsc, snaptrace, - snaptrace + snaptrace_len, - false, &realm); + if (ceph_update_snap_trace(mdsc, snaptrace, + snaptrace + snaptrace_len, + false, &realm)) { + up_write(&mdsc->snap_rwsem); + close_sessions = true; + goto done; + } downgrade_write(&mdsc->snap_rwsem); } else { down_read(&mdsc->snap_rwsem); @@ -4277,6 +4283,11 @@ done_unlocked: iput(inode); out: ceph_put_string(extra_info.pool_ns); + + /* Defer closing the sessions until after the s_mutex lock is released */ + if (close_sessions) +
ceph_mdsc_close_sessions(mdsc); + return; flush_cap_releases: diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 6c7026cc8988..0ced8b570e42 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -845,7 +845,7 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) return PTR_ERR(result); } -static int ceph_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); @@ -905,13 +905,13 @@ out: return err; } -static int ceph_create(struct user_namespace *mnt_userns, struct inode *dir, +static int ceph_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - return ceph_mknod(mnt_userns, dir, dentry, mode, 0); + return ceph_mknod(idmap, dir, dentry, mode, 0); } -static int ceph_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int ceph_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *dest) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); @@ -970,7 +970,7 @@ out: return err; } -static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); @@ -1269,7 +1269,7 @@ out: return err; } -static int ceph_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int ceph_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 764598e1efd9..5dcc62e678c4 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -103,14 +103,10 @@ static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize, size += bytes; for ( ; bytes; idx++, bvec_idx++) { - struct bio_vec bv = { - .bv_page = pages[idx], - .bv_len = min_t(int, bytes, PAGE_SIZE - start), - .bv_offset = start, - }; - - bvecs[bvec_idx] = bv; - bytes -= bv.bv_len; + int len = min_t(int, bytes, PAGE_SIZE - start); + + bvec_set_page(&bvecs[bvec_idx], pages[idx], len, start); + bytes -= len; start = 0; } } @@ -2011,6 +2007,9 @@ static int ceph_zero_partial_object(struct inode *inode, loff_t zero = 0; int op; + if (ceph_inode_is_shutdown(inode)) + return -EIO; + if (!length) { op = offset ? 
CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE; length = &zero; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 23d05ec87fcc..8e5f41d45283 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2227,7 +2227,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) /* * setattr */ -int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -2240,7 +2240,7 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (ceph_inode_is_shutdown(inode)) return -ESTALE; - err = setattr_prepare(&init_user_ns, dentry, attr); + err = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (err != 0) return err; @@ -2255,7 +2255,7 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, err = __ceph_setattr(inode, attr); if (err >= 0 && (attr->ia_valid & ATTR_MODE)) - err = posix_acl_chmod(&init_user_ns, dentry, attr->ia_mode); + err = posix_acl_chmod(&nop_mnt_idmap, dentry, attr->ia_mode); return err; } @@ -2397,7 +2397,7 @@ out: * Check inode permissions. We verify we have a valid value for * the AUTH cap, then call the generic handler. */ -int ceph_permission(struct user_namespace *mnt_userns, struct inode *inode, +int ceph_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int err; @@ -2408,7 +2408,7 @@ int ceph_permission(struct user_namespace *mnt_userns, struct inode *inode, err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false); if (!err) - err = generic_permission(&init_user_ns, inode, mask); + err = generic_permission(&nop_mnt_idmap, inode, mask); return err; } @@ -2417,10 +2417,10 @@ static int statx_to_caps(u32 want, umode_t mode) { int mask = 0; - if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME)) + if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME|STATX_CHANGE_COOKIE)) mask |= CEPH_CAP_AUTH_SHARED; - if (want & (STATX_NLINK|STATX_CTIME)) { + if (want & (STATX_NLINK|STATX_CTIME|STATX_CHANGE_COOKIE)) { /* * The link count for directories depends on inode->i_subdirs, * and that is only updated when Fs caps are held. @@ -2431,11 +2431,10 @@ static int statx_to_caps(u32 want, umode_t mode) mask |= CEPH_CAP_LINK_SHARED; } - if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE| - STATX_BLOCKS)) + if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE|STATX_BLOCKS|STATX_CHANGE_COOKIE)) mask |= CEPH_CAP_FILE_SHARED; - if (want & (STATX_CTIME)) + if (want & (STATX_CTIME|STATX_CHANGE_COOKIE)) mask |= CEPH_CAP_XATTR_SHARED; return mask; @@ -2445,7 +2444,7 @@ static int statx_to_caps(u32 want, umode_t mode) * Get all the attributes. If we have sufficient caps for the requested attrs, * then we can avoid talking to the MDS at all. 
*/ -int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ceph_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); @@ -2466,7 +2465,7 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, return err; } - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); stat->ino = ceph_present_inode(inode); /* @@ -2478,6 +2477,11 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, valid_mask |= STATX_BTIME; } + if (request_mask & STATX_CHANGE_COOKIE) { + stat->change_cookie = inode_peek_iversion_raw(inode); + valid_mask |= STATX_CHANGE_COOKIE; + } + if (ceph_snap(inode) == CEPH_NOSNAP) stat->dev = sb->s_dev; else @@ -2519,6 +2523,8 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, stat->nlink = 1 + 1 + ci->i_subdirs; } + stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC; + stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC; stat->result_mask = request_mask & valid_mask; return err; } diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 9c8dc8a55e7e..cb51c7e9c8e2 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -7,6 +7,7 @@ #include "super.h" #include "mds_client.h" +#include <linux/filelock.h> #include <linux/ceph/pagelist.h> static u64 lock_secret; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 26a0a8b9975e..27a245d959c0 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -806,6 +806,9 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, { struct ceph_mds_session *s; + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) + return ERR_PTR(-EIO); + if (mds >= mdsc->mdsmap->possible_max_rank) return ERR_PTR(-EINVAL); @@ -1478,6 +1481,9 @@ static int __open_session(struct ceph_mds_client *mdsc, int mstate; int mds = session->s_mds; + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) + return -EIO; + /* wait for mds to go active? 
*/ mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); dout("open_session to mds%d (%s)\n", mds, @@ -2860,6 +2866,11 @@ static void __do_request(struct ceph_mds_client *mdsc, return; } + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) { + dout("do_request metadata corrupted\n"); + err = -EIO; + goto finish; + } if (req->r_timeout && time_after_eq(jiffies, req->r_started + req->r_timeout)) { dout("do_request timed out\n"); @@ -3245,6 +3256,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) u64 tid; int err, result; int mds = session->s_mds; + bool close_sessions = false; if (msg->front.iov_len < sizeof(*head)) { pr_err("mdsc_handle_reply got corrupt (short) reply\n"); @@ -3351,10 +3363,17 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) realm = NULL; if (rinfo->snapblob_len) { down_write(&mdsc->snap_rwsem); - ceph_update_snap_trace(mdsc, rinfo->snapblob, + err = ceph_update_snap_trace(mdsc, rinfo->snapblob, rinfo->snapblob + rinfo->snapblob_len, le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP, &realm); + if (err) { + up_write(&mdsc->snap_rwsem); + close_sessions = true; + if (err == -EIO) + ceph_msg_dump(msg); + goto out_err; + } downgrade_write(&mdsc->snap_rwsem); } else { down_read(&mdsc->snap_rwsem); @@ -3412,6 +3431,10 @@ out_err: req->r_end_latency, err); out: ceph_mdsc_put_request(req); + + /* Defer closing the sessions until after the s_mutex lock is released */ + if (close_sessions) + ceph_mdsc_close_sessions(mdsc); return; } @@ -3662,6 +3685,12 @@ static void handle_session(struct ceph_mds_session *session, break; case CEPH_SESSION_FLUSHMSG: + /* flush cap releases */ + spin_lock(&session->s_cap_lock); + if (session->s_num_cap_releases) + ceph_flush_cap_releases(mdsc, session); + spin_unlock(&session->s_cap_lock); + send_flushmsg_ack(mdsc, session, seq); break; @@ -5011,7 +5040,7 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped) } /* - * called after sb is ro. + * called after sb is ro or when metadata is corrupted. */ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) { @@ -5301,7 +5330,8 @@ static void mds_peer_reset(struct ceph_connection *con) struct ceph_mds_client *mdsc = s->s_mdsc; pr_warn("mds%d closed our session\n", s->s_mds); - send_mds_reconnect(mdsc, s); + if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO) + send_mds_reconnect(mdsc, s); } static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg) diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index e4151852184e..87007203f130 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> +#include <linux/fs.h> #include <linux/sort.h> #include <linux/slab.h> #include <linux/iversion.h> @@ -766,8 +767,10 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm; struct ceph_snap_realm *first_realm = NULL; struct ceph_snap_realm *realm_to_rebuild = NULL; + struct ceph_client *client = mdsc->fsc->client; int rebuild_snapcs; int err = -ENOMEM; + int ret; LIST_HEAD(dirty_realms); lockdep_assert_held_write(&mdsc->snap_rwsem); @@ -884,6 +887,27 @@ fail: if (first_realm) ceph_put_snap_realm(mdsc, first_realm); pr_err("%s error %d\n", __func__, err); + + /* + * When receiving a corrupted snap trace we don't know what + * exactly has happened on the MDS side, and we shouldn't continue + * writing to the OSD, which may corrupt the snapshot contents.
+ * + * Just try to blocklist this kclient and then this kclient + * must be remounted to continue after the corrupted metadata + * is fixed on the MDS side. + */ + WRITE_ONCE(mdsc->fsc->mount_state, CEPH_MOUNT_FENCE_IO); + ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr); + if (ret) + pr_err("%s failed to blocklist %s: %d\n", __func__, + ceph_pr_addr(&client->msgr.inst.addr), ret); + + WARN(1, "%s: %s%sdo remount to continue%s", + __func__, ret ? "" : ceph_pr_addr(&client->msgr.inst.addr), + ret ? "" : " was blocklisted, ", + err == -EIO ? " after corrupted snaptrace is fixed" : ""); + return err; } @@ -984,6 +1008,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, __le64 *split_inos = NULL, *split_realms = NULL; int i; int locked_rwsem = 0; + bool close_sessions = false; /* decode */ if (msg->front.iov_len < sizeof(*h)) @@ -1092,8 +1117,12 @@ skip_inode: * update using the provided snap trace. if we are deleting a * snap, we can avoid queueing cap_snaps. */ - ceph_update_snap_trace(mdsc, p, e, - op == CEPH_SNAP_OP_DESTROY, NULL); + if (ceph_update_snap_trace(mdsc, p, e, + op == CEPH_SNAP_OP_DESTROY, + NULL)) { + close_sessions = true; + goto bad; + } if (op == CEPH_SNAP_OP_SPLIT) /* we took a reference when we created the realm, above */ @@ -1112,6 +1141,9 @@ bad: out: if (locked_rwsem) up_write(&mdsc->snap_rwsem); + + if (close_sessions) + ceph_mdsc_close_sessions(mdsc); return; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 0ed3be75bb9a..6ecca2c6d137 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -100,6 +100,17 @@ struct ceph_mount_options { char *mon_addr; }; +/* mount state */ +enum { + CEPH_MOUNT_MOUNTING, + CEPH_MOUNT_MOUNTED, + CEPH_MOUNT_UNMOUNTING, + CEPH_MOUNT_UNMOUNTED, + CEPH_MOUNT_SHUTDOWN, + CEPH_MOUNT_RECOVER, + CEPH_MOUNT_FENCE_IO, +}; + #define CEPH_ASYNC_CREATE_CONFLICT_BITS 8 struct ceph_fs_client { @@ -1039,12 +1050,12 @@ static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) { return __ceph_do_getattr(inode, NULL, mask, force); } -extern int ceph_permission(struct user_namespace *mnt_userns, +extern int ceph_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); extern int __ceph_setattr(struct inode *inode, struct iattr *attr); -extern int ceph_setattr(struct user_namespace *mnt_userns, +extern int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); -extern int ceph_getattr(struct user_namespace *mnt_userns, +extern int ceph_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); void ceph_inode_shutdown(struct inode *inode); @@ -1117,7 +1128,7 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx); #ifdef CONFIG_CEPH_FS_POSIX_ACL struct posix_acl *ceph_get_acl(struct inode *, int, bool); -int ceph_set_acl(struct user_namespace *mnt_userns, +int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); int ceph_pre_init_acls(struct inode *dir, umode_t *mode, struct ceph_acl_sec_ctx *as_ctx); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index f31350cda960..f65b07cc33a2 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1285,7 +1285,7 @@ static int ceph_get_xattr_handler(const struct xattr_handler *handler, } static int ceph_set_xattr_handler(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags)
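Nearly every hunk in the cachefiles and ceph diffs above is the same mechanical conversion: inode operations and VFS helpers that used to take a struct user_namespace *mnt_userns now take a struct mnt_idmap *idmap, and a filesystem that does not support idmapped mounts keeps handing the global no-op mapping to the generic helpers. Below is a minimal sketch of the resulting callback shape, modeled on the ceph_setattr() hunk above; myfs_setattr is a hypothetical name for illustration, not part of this patch, and the sketch assumes the post-conversion mnt_idmap API and its <linux/mnt_idmap.h> header.

#include <linux/fs.h>
#include <linux/mnt_idmap.h>

/* Hypothetical example of the post-conversion ->setattr shape. */
static int myfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			struct iattr *attr)
{
	int err;

	/*
	 * The callback now receives the mount's idmap, but a filesystem
	 * without idmapped-mount support ignores it and passes the no-op
	 * mapping to the VFS helpers, as ceph_setattr() does above.
	 */
	err = setattr_prepare(&nop_mnt_idmap, dentry, attr);
	if (err)
		return err;

	/* ... apply the attribute change to the inode here ... */
	return 0;
}

The same substitution, &init_user_ns becoming &nop_mnt_idmap, is what the notify_change(), vfs_mkdir(), vfs_unlink(), vfs_link(), vfs_tmpfile_open() and vfs_*xattr() call sites in the cachefiles hunks above perform.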
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 3b7e3b9e4fd2..4c0d53bf931a 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -18,40 +18,38 @@ config CIFS select DNS_RESOLVER select ASN1 select OID_REGISTRY + select NETFS_SUPPORT help - This is the client VFS module for the SMB3 family of NAS protocols, - (including support for the most recent, most secure dialect SMB3.1.1) - as well as for earlier dialects such as SMB2.1, SMB2 and the older - Common Internet File System (CIFS) protocol. CIFS was the successor - to the original dialect, the Server Message Block (SMB) protocol, the - native file sharing mechanism for most early PC operating systems. - - The SMB3 protocol is supported by most modern operating systems - and NAS appliances (e.g. Samba, Windows 10, Windows Server 2016, - MacOS) and even in the cloud (e.g. Microsoft Azure). - The older CIFS protocol was included in Windows NT4, 2000 and XP (and - later) as well by Samba (which provides excellent CIFS and SMB3 - server support for Linux and many other operating systems). Use of - dialects older than SMB2.1 is often discouraged on public networks. + This is the client VFS module for the SMB3 family of network file + protocols (including the most recent, most secure dialect SMB3.1.1). + This module also includes support for earlier dialects such as + SMB2.1, SMB2 and even the old Common Internet File System (CIFS) + protocol. CIFS was the successor to the original network filesystem + protocol, Server Message Block (SMB, i.e. SMB1), the native file sharing + mechanism for most early PC operating systems. + + The SMB3.1.1 protocol is supported by most modern operating systems + and NAS appliances (e.g. Samba, Windows 11, Windows Server 2022, + MacOS) and even in the cloud (e.g. Microsoft Azure) and also by the + Linux kernel server, ksmbd. Support for the older CIFS protocol was + included in Windows NT4, 2000 and XP (and later). Use of dialects + older than SMB2.1 is often discouraged on public networks. This module also provides limited support for OS/2 and Windows ME and similar very old servers. - This module provides an advanced network file system client - for mounting to SMB3 (and CIFS) compliant servers. It includes - support for DFS (hierarchical name space), secure per-user - session establishment via Kerberos or NTLM or NTLMv2, RDMA - (smbdirect), advanced security features, per-share encryption, - directory leases, safe distributed caching (oplock), optional packet - signing, Unicode and other internationalization improvements. + This module provides an advanced network file system client for + mounting to SMB3 (and CIFS) compliant servers. It includes support + for DFS (hierarchical name space), secure per-user session + establishment via Kerberos or NTLMv2, RDMA (smbdirect), advanced + security features, per-share encryption, packet-signing, snapshots, + directory leases, safe distributed caching (leases), multichannel, + Unicode and other internationalization improvements. In general, the default dialects, SMB3 and later, enable better performance, security and features, than would be possible with CIFS. - Note that when mounting to Samba, due to the CIFS POSIX extensions, - CIFS mounts can provide slightly better POSIX compatibility - than SMB3 mounts. SMB2/SMB3 mount options are also - slightly simpler (compared to CIFS) due to protocol improvements. - If you need to mount to Samba, Azure, Macs or Windows from this machine, say Y.
+ If you need to mount to Samba, Azure, ksmbd, Macs or Windows from this + machine, say Y. config CIFS_STATS2 bool "Extended statistics" @@ -111,12 +109,12 @@ config CIFS_POSIX depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY && CIFS_XATTR help Enabling this option will cause the cifs client to attempt to - negotiate a newer dialect with servers, such as Samba 3.0.5 - or later, that optionally can handle more POSIX like (rather - than Windows like) file behavior. It also enables - support for POSIX ACLs (getfacl and setfacl) to servers - (such as Samba 3.10 and later) which can negotiate - CIFS POSIX ACL support. If unsure, say N. + negotiate a feature of the older cifs dialect with servers, such as + Samba 3.0.5 or later, that optionally can handle more POSIX like + (rather than Windows like) file behavior. It also enables support + for POSIX ACLs (getfacl and setfacl) to servers (such as Samba 3.10 + and later) which can negotiate CIFS POSIX ACL support. This config + option is not needed when mounting with SMB3.1.1. If unsure, say N. config CIFS_DEBUG bool "Enable CIFS debugging routines" @@ -178,6 +176,8 @@ config CIFS_NFSD_EXPORT help Allows NFS server to export a CIFS mounted share (nfsd over cifs) +if CIFS + config CIFS_SMB_DIRECT bool "SMB Direct support" depends on CIFS=m && INFINIBAND && INFINIBAND_ADDR_TRANS || CIFS=y && INFINIBAND=y && INFINIBAND_ADDR_TRANS=y @@ -201,3 +201,5 @@ config CIFS_ROOT Enables root file system support over SMB protocol. Most people say N here. + +endif diff --git a/fs/cifs/cached_dir.c b/fs/cifs/cached_dir.c index 60399081046a..75d5e06306ea 100644 --- a/fs/cifs/cached_dir.c +++ b/fs/cifs/cached_dir.c @@ -14,6 +14,7 @@ static struct cached_fid *init_cached_dir(const char *path); static void free_cached_dir(struct cached_fid *cfid); +static void smb2_close_cached_fid(struct kref *ref); static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, const char *path, @@ -181,12 +182,13 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, rqst[0].rq_iov = open_iov; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; - oparms.tcon = tcon; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_FILE); - oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.disposition = FILE_OPEN; - oparms.fid = pfid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_FILE), + .desired_access = FILE_READ_ATTRIBUTES, + .disposition = FILE_OPEN, + .fid = pfid, + }; rc = SMB2_open_init(tcon, server, &rqst[0], &oplock, &oparms, utf16_path); @@ -220,8 +222,8 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, } goto oshr_free; } - - atomic_inc(&tcon->num_remote_opens); + cfid->tcon = tcon; + cfid->is_open = true; o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; oparms.fid->persistent_fid = o_rsp->PersistentFileId; @@ -233,12 +235,12 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, if (o_rsp->OplockLevel != SMB2_OPLOCK_LEVEL_LEASE) goto oshr_free; - smb2_parse_contexts(server, o_rsp, &oparms.fid->epoch, oparms.fid->lease_key, &oplock, NULL, NULL); - + if (!(oplock & SMB2_LEASE_READ_CACHING_HE)) + goto oshr_free; qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info)) goto oshr_free; @@ -259,9 +261,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, } } cfid->dentry = dentry; - cfid->tcon = tcon; cfid->time = jiffies; - cfid->is_open = 
true; cfid->has_lease = true; oshr_free: @@ -271,7 +271,7 @@ oshr_free: free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); spin_lock(&cfids->cfid_list_lock); - if (!cfid->has_lease) { + if (rc && !cfid->has_lease) { if (cfid->on_list) { list_del(&cfid->entry); cfid->on_list = false; @@ -280,13 +280,27 @@ oshr_free: rc = -ENOENT; } spin_unlock(&cfids->cfid_list_lock); + if (!rc && !cfid->has_lease) { + /* + * We are guaranteed to have two references at this point. + * One for the caller and one for a potential lease. + * Release the Lease-ref so that the directory will be closed + * when the caller closes the cached handle. + */ + kref_put(&cfid->refcount, smb2_close_cached_fid); + } if (rc) { + if (cfid->is_open) + SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid, + cfid->fid.volatile_fid); free_cached_dir(cfid); cfid = NULL; } - if (rc == 0) + if (rc == 0) { *ret_cfid = cfid; + atomic_inc(&tcon->num_remote_opens); + } return rc; } @@ -335,6 +349,7 @@ smb2_close_cached_fid(struct kref *ref) if (cfid->is_open) { SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid, cfid->fid.volatile_fid); + atomic_dec(&cfid->tcon->num_remote_opens); } free_cached_dir(cfid); diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 56b23def4c95..1911f7016fa1 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -8,6 +8,7 @@ #include <linux/fs.h> #include <linux/string.h> #include <linux/ctype.h> +#include <linux/kstrtox.h> #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/uaccess.h> @@ -455,8 +456,10 @@ skip_rdma: spin_lock(&ses->iface_lock); if (ses->iface_count) - seq_printf(m, "\n\n\tServer interfaces: %zu", - ses->iface_count); + seq_printf(m, "\n\n\tServer interfaces: %zu" + "\tLast updated: %lu seconds ago", + ses->iface_count, + (jiffies - ses->iface_last_update) / HZ); j = 0; list_for_each_entry(iface, &ses->iface_list, iface_head) { @@ -787,7 +790,7 @@ static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer, rc = get_user(c[0], buffer); if (rc) return rc; - if (strtobool(c, &bv) == 0) + if (kstrtobool(c, &bv) == 0) cifsFYI = bv; else if ((c[0] > '1') && (c[0] <= '9')) cifsFYI = (int) (c[0] - '0'); /* see cifs_debug.h for meanings */ @@ -947,7 +950,7 @@ static ssize_t cifs_security_flags_proc_write(struct file *file, if (count < 3) { /* single char or single char followed by null */ - if (strtobool(flags_string, &bv) == 0) { + if (kstrtobool(flags_string, &bv) == 0) { global_secflags = bv ? 
CIFSSEC_MAX : CIFSSEC_DEF; return count; } else if (!isdigit(flags_string[0])) { diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h index 7f102ffeb675..e4d751b0c812 100644 --- a/fs/cifs/cifs_spnego.h +++ b/fs/cifs/cifs_spnego.h @@ -24,7 +24,7 @@ struct cifs_spnego_msg { uint32_t flags; uint32_t sesskey_len; uint32_t secblob_len; - uint8_t data[1]; + uint8_t data[]; }; #ifdef __KERNEL__ diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index bbf58c2439da..f5b6df82e857 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1428,14 +1428,15 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, tcon = tlink_tcon(tlink); xid = get_xid(); - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = READ_CONTROL; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.disposition = FILE_OPEN; - oparms.path = path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = READ_CONTROL, + .create_options = cifs_create_options(cifs_sb, 0), + .disposition = FILE_OPEN, + .path = path, + .fid = &fid, + }; rc = CIFS_open(xid, &oparms, &oplock, NULL); if (!rc) { @@ -1494,14 +1495,15 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, else access_flags = WRITE_DAC; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = access_flags; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.disposition = FILE_OPEN; - oparms.path = path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = access_flags, + .create_options = cifs_create_options(cifs_sb, 0), + .disposition = FILE_OPEN, + .path = path, + .fid = &fid, + }; rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc) { @@ -1674,7 +1676,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, return rc; } -struct posix_acl *cifs_get_acl(struct user_namespace *mnt_userns, +struct posix_acl *cifs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type) { #if defined(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) && defined(CONFIG_CIFS_POSIX) @@ -1738,7 +1740,7 @@ out: #endif } -int cifs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int cifs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { #if defined(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) && defined(CONFIG_CIFS_POSIX) diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index cbc18b4a9cb2..357bd27a7fd1 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -24,12 +24,156 @@ #include "../smbfs_common/arc4.h" #include <crypto/aead.h> +/* + * Hash data from a BVEC-type iterator. + */ +static int cifs_shash_bvec(const struct iov_iter *iter, ssize_t maxsize, + struct shash_desc *shash) +{ + const struct bio_vec *bv = iter->bvec; + unsigned long start = iter->iov_offset; + unsigned int i; + void *p; + int ret; + + for (i = 0; i < iter->nr_segs; i++) { + size_t off, len; + + len = bv[i].bv_len; + if (start >= len) { + start -= len; + continue; + } + + len = min_t(size_t, maxsize, len - start); + off = bv[i].bv_offset + start; + + p = kmap_local_page(bv[i].bv_page); + ret = crypto_shash_update(shash, p + off, len); + kunmap_local(p); + if (ret < 0) + return ret; + + maxsize -= len; + if (maxsize <= 0) + break; + start = 0; + } + + return 0; +} + +/* + * Hash data from a KVEC-type iterator. 
+ */ +static int cifs_shash_kvec(const struct iov_iter *iter, ssize_t maxsize, + struct shash_desc *shash) +{ + const struct kvec *kv = iter->kvec; + unsigned long start = iter->iov_offset; + unsigned int i; + int ret; + + for (i = 0; i < iter->nr_segs; i++) { + size_t len; + + len = kv[i].iov_len; + if (start >= len) { + start -= len; + continue; + } + + len = min_t(size_t, maxsize, len - start); + ret = crypto_shash_update(shash, kv[i].iov_base + start, len); + if (ret < 0) + return ret; + maxsize -= len; + + if (maxsize <= 0) + break; + start = 0; + } + + return 0; +} + +/* + * Hash data from an XARRAY-type iterator. + */ +static ssize_t cifs_shash_xarray(const struct iov_iter *iter, ssize_t maxsize, + struct shash_desc *shash) +{ + struct folio *folios[16], *folio; + unsigned int nr, i, j, npages; + loff_t start = iter->xarray_start + iter->iov_offset; + pgoff_t last, index = start / PAGE_SIZE; + ssize_t ret = 0; + size_t len, offset, foffset; + void *p; + + if (maxsize == 0) + return 0; + + last = (start + maxsize - 1) / PAGE_SIZE; + do { + nr = xa_extract(iter->xarray, (void **)folios, index, last, + ARRAY_SIZE(folios), XA_PRESENT); + if (nr == 0) + return -EIO; + + for (i = 0; i < nr; i++) { + folio = folios[i]; + npages = folio_nr_pages(folio); + foffset = start - folio_pos(folio); + offset = foffset % PAGE_SIZE; + for (j = foffset / PAGE_SIZE; j < npages; j++) { + len = min_t(size_t, maxsize, PAGE_SIZE - offset); + p = kmap_local_page(folio_page(folio, j)); + ret = crypto_shash_update(shash, p, len); + kunmap_local(p); + if (ret < 0) + return ret; + maxsize -= len; + if (maxsize <= 0) + return 0; + start += len; + offset = 0; + index++; + } + } + } while (nr == ARRAY_SIZE(folios)); + return 0; +} + +/* + * Pass the data from an iterator into a hash. 
+ */ +static int cifs_shash_iter(const struct iov_iter *iter, size_t maxsize, + struct shash_desc *shash) +{ + if (maxsize == 0) + return 0; + + switch (iov_iter_type(iter)) { + case ITER_BVEC: + return cifs_shash_bvec(iter, maxsize, shash); + case ITER_KVEC: + return cifs_shash_kvec(iter, maxsize, shash); + case ITER_XARRAY: + return cifs_shash_xarray(iter, maxsize, shash); + default: + pr_err("cifs_shash_iter(%u) unsupported\n", iov_iter_type(iter)); + WARN_ON_ONCE(1); + return -EIO; + } +} + int __cifs_calc_signature(struct smb_rqst *rqst, - struct TCP_Server_Info *server, char *signature, - struct shash_desc *shash) + struct TCP_Server_Info *server, char *signature, + struct shash_desc *shash) { int i; - int rc; + ssize_t rc; struct kvec *iov = rqst->rq_iov; int n_vec = rqst->rq_nvec; @@ -61,25 +205,9 @@ int __cifs_calc_signature(struct smb_rqst *rqst, } } - /* now hash over the rq_pages array */ - for (i = 0; i < rqst->rq_npages; i++) { - void *kaddr; - unsigned int len, offset; - - rqst_page_get_length(rqst, i, &len, &offset); - - kaddr = (char *) kmap(rqst->rq_pages[i]) + offset; - - rc = crypto_shash_update(shash, kaddr, len); - if (rc) { - cifs_dbg(VFS, "%s: Could not update with payload\n", - __func__); - kunmap(rqst->rq_pages[i]); - return rc; - } - - kunmap(rqst->rq_pages[i]); - } + rc = cifs_shash_iter(&rqst->rq_iter, iov_iter_count(&rqst->rq_iter), shash); + if (rc < 0) + return rc; rc = crypto_shash_final(shash, signature); if (rc) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 10e00c624922..cbcf210d56e4 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -12,6 +12,7 @@ #include <linux/module.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/mount.h> #include <linux/slab.h> #include <linux/init.h> @@ -345,7 +346,7 @@ static long cifs_fallocate(struct file *file, int mode, loff_t off, loff_t len) return -EOPNOTSUPP; } -static int cifs_permission(struct user_namespace *mnt_userns, +static int cifs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct cifs_sb_info *cifs_sb; @@ -361,7 +362,7 @@ static int cifs_permission(struct user_namespace *mnt_userns, on the client (above and beyond ACL on servers) for servers which do not support setting and viewing mode bits, so allowing client to check permissions is useful */ - return generic_permission(&init_user_ns, inode, mask); + return generic_permission(&nop_mnt_idmap, inode, mask); } static struct kmem_cache *cifs_inode_cachep; @@ -1358,7 +1359,7 @@ const struct file_operations cifs_file_ops = { .fsync = cifs_fsync, .flush = cifs_flush, .mmap = cifs_file_mmap, - .splice_read = generic_file_splice_read, + .splice_read = cifs_splice_read, .splice_write = iter_file_splice_write, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, @@ -1378,7 +1379,7 @@ const struct file_operations cifs_file_strict_ops = { .fsync = cifs_strict_fsync, .flush = cifs_flush, .mmap = cifs_file_strict_mmap, - .splice_read = generic_file_splice_read, + .splice_read = cifs_splice_read, .splice_write = iter_file_splice_write, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, @@ -1398,7 +1399,7 @@ const struct file_operations cifs_file_direct_ops = { .fsync = cifs_fsync, .flush = cifs_flush, .mmap = cifs_file_mmap, - .splice_read = generic_file_splice_read, + .splice_read = direct_splice_read, .splice_write = iter_file_splice_write, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, @@ -1416,7 +1417,7 @@ const struct file_operations cifs_file_nobrl_ops = { .fsync = cifs_fsync, .flush = 
cifs_flush, .mmap = cifs_file_mmap, - .splice_read = generic_file_splice_read, + .splice_read = cifs_splice_read, .splice_write = iter_file_splice_write, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, @@ -1434,7 +1435,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = { .fsync = cifs_strict_fsync, .flush = cifs_flush, .mmap = cifs_file_strict_mmap, - .splice_read = generic_file_splice_read, + .splice_read = cifs_splice_read, .splice_write = iter_file_splice_write, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, @@ -1452,7 +1453,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = { .fsync = cifs_fsync, .flush = cifs_flush, .mmap = cifs_file_mmap, - .splice_read = generic_file_splice_read, + .splice_read = direct_splice_read, .splice_write = iter_file_splice_write, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 63a0ac2b9355..71fe0a0a7992 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -49,7 +49,7 @@ extern void cifs_sb_deactive(struct super_block *sb); /* Functions related to inodes */ extern const struct inode_operations cifs_dir_inode_ops; extern struct inode *cifs_root_iget(struct super_block *); -extern int cifs_create(struct user_namespace *, struct inode *, +extern int cifs_create(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, bool excl); extern int cifs_atomic_open(struct inode *, struct dentry *, struct file *, unsigned, umode_t); @@ -57,12 +57,12 @@ extern struct dentry *cifs_lookup(struct inode *, struct dentry *, unsigned int); extern int cifs_unlink(struct inode *dir, struct dentry *dentry); extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *); -extern int cifs_mknod(struct user_namespace *, struct inode *, struct dentry *, +extern int cifs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); -extern int cifs_mkdir(struct user_namespace *, struct inode *, struct dentry *, +extern int cifs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, umode_t); extern int cifs_rmdir(struct inode *, struct dentry *); -extern int cifs_rename2(struct user_namespace *, struct inode *, +extern int cifs_rename2(struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); extern int cifs_revalidate_file_attr(struct file *filp); @@ -72,9 +72,9 @@ extern int cifs_revalidate_dentry(struct dentry *); extern int cifs_invalidate_mapping(struct inode *inode); extern int cifs_revalidate_mapping(struct inode *inode); extern int cifs_zap_mapping(struct inode *inode); -extern int cifs_getattr(struct user_namespace *, const struct path *, +extern int cifs_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); -extern int cifs_setattr(struct user_namespace *, struct dentry *, +extern int cifs_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern int cifs_fiemap(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); @@ -100,6 +100,9 @@ extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to); extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from); extern ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from); extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from); +extern ssize_t cifs_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags); extern int cifs_flock(struct file *pfile, int cmd, struct file_lock 
*plock); extern int cifs_lock(struct file *, int, struct file_lock *); extern int cifs_fsync(struct file *, loff_t, loff_t, int); @@ -110,6 +113,9 @@ extern int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma); extern const struct file_operations cifs_dir_ops; extern int cifs_dir_open(struct inode *inode, struct file *file); extern int cifs_readdir(struct file *file, struct dir_context *ctx); +extern void cifs_pages_written_back(struct inode *inode, loff_t start, unsigned int len); +extern void cifs_pages_write_failed(struct inode *inode, loff_t start, unsigned int len); +extern void cifs_pages_write_redirty(struct inode *inode, loff_t start, unsigned int len); /* Functions related to dir entries */ extern const struct dentry_operations cifs_dentry_ops; @@ -124,7 +130,7 @@ extern struct vfsmount *cifs_dfs_d_automount(struct path *path); /* Functions related to symlinks */ extern const char *cifs_get_link(struct dentry *, struct inode *, struct delayed_call *); -extern int cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode, +extern int cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, struct dentry *direntry, const char *symname); #ifdef CONFIG_CIFS_XATTR @@ -154,5 +160,5 @@ extern const struct export_operations cifs_export_ops; /* when changing internal version - update following two lines at same time */ #define SMB3_PRODUCT_BUILD 41 -#define CIFS_VERSION "2.41" +#define CIFS_VERSION "2.42" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index cfdd5bf701a1..a99883f16d94 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -26,6 +26,7 @@ #include <uapi/linux/cifs/cifs_mount.h> #include "../smbfs_common/smb2pdu.h" #include "smb2pdu.h" +#include <linux/filelock.h> #define SMB_PATH_MAX 260 #define CIFS_PORT 445 @@ -77,10 +78,6 @@ #define SMB_ECHO_INTERVAL_MAX 600 #define SMB_ECHO_INTERVAL_DEFAULT 60 -/* dns resolution intervals in seconds */ -#define SMB_DNS_RESOLVE_INTERVAL_MIN 120 -#define SMB_DNS_RESOLVE_INTERVAL_DEFAULT 600 - /* smb multichannel query server interfaces interval in seconds */ #define SMB_INTERFACE_POLL_INTERVAL 600 @@ -216,11 +213,9 @@ static inline void cifs_free_open_info(struct cifs_open_info_data *data) struct smb_rqst { struct kvec *rq_iov; /* array of kvecs */ unsigned int rq_nvec; /* number of kvecs in array */ - struct page **rq_pages; /* pointer to array of page ptrs */ - unsigned int rq_offset; /* the offset to the 1st page */ - unsigned int rq_npages; /* number pages in array */ - unsigned int rq_pagesz; /* page size to use */ - unsigned int rq_tailsz; /* length of last page */ + size_t rq_iter_size; /* Amount of data in ->rq_iter */ + struct iov_iter rq_iter; /* Data iterator */ + struct xarray rq_buffer; /* Page buffer for encryption */ }; struct mid_q_entry; @@ -691,7 +686,6 @@ struct TCP_Server_Info { /* point to the SMBD connection if RDMA is used instead of socket */ struct smbd_connection *smbd_conn; struct delayed_work echo; /* echo ping workqueue job */ - struct delayed_work resolve; /* dns resolution workqueue job */ char *smallbuf; /* pointer to current "small" buffer */ char *bigbuf; /* pointer to current "big" buffer */ /* Total size of this PDU. 
Only valid from cifs_demultiplex_thread */ @@ -1426,10 +1420,11 @@ struct cifs_aio_ctx { struct cifsFileInfo *cfile; struct bio_vec *bv; loff_t pos; - unsigned int npages; + unsigned int nr_pinned_pages; ssize_t rc; unsigned int len; unsigned int total_len; + unsigned int bv_need_unpin; /* If ->bv[] needs unpinning */ bool should_dirty; /* * Indicates if this aio_ctx is for direct_io, @@ -1447,28 +1442,18 @@ struct cifs_readdata { struct address_space *mapping; struct cifs_aio_ctx *ctx; __u64 offset; + ssize_t got_bytes; unsigned int bytes; - unsigned int got_bytes; pid_t pid; int result; struct work_struct work; - int (*read_into_pages)(struct TCP_Server_Info *server, - struct cifs_readdata *rdata, - unsigned int len); - int (*copy_into_pages)(struct TCP_Server_Info *server, - struct cifs_readdata *rdata, - struct iov_iter *iter); + struct iov_iter iter; struct kvec iov[2]; struct TCP_Server_Info *server; #ifdef CONFIG_CIFS_SMB_DIRECT struct smbd_mr *mr; #endif - unsigned int pagesz; - unsigned int page_offset; - unsigned int tailsz; struct cifs_credits credits; - unsigned int nr_pages; - struct page **pages; }; /* asynchronous write support */ @@ -1480,6 +1465,8 @@ struct cifs_writedata { struct work_struct work; struct cifsFileInfo *cfile; struct cifs_aio_ctx *ctx; + struct iov_iter iter; + struct bio_vec *bv; __u64 offset; pid_t pid; unsigned int bytes; @@ -1488,12 +1475,7 @@ struct cifs_writedata { #ifdef CONFIG_CIFS_SMB_DIRECT struct smbd_mr *mr; #endif - unsigned int pagesz; - unsigned int page_offset; - unsigned int tailsz; struct cifs_credits credits; - unsigned int nr_pages; - struct page **pages; }; /* @@ -2153,15 +2135,21 @@ static inline void move_cifs_info_to_smb2(struct smb2_file_all_info *dst, const dst->FileNameLength = src->FileNameLength; } -static inline unsigned int cifs_get_num_sgs(const struct smb_rqst *rqst, - int num_rqst, - const u8 *sig) +static inline int cifs_get_num_sgs(const struct smb_rqst *rqst, + int num_rqst, + const u8 *sig) { unsigned int len, skip; unsigned int nents = 0; unsigned long addr; int i, j; + /* + * The first rqst has a transform header where the first 20 bytes are + * not part of the encrypted blob. + */ + skip = 20; + /* Assumes the first rqst has a transform header as the first iov. * I.e. * rqst[0].rq_iov[0] is transform header @@ -2169,14 +2157,22 @@ static inline unsigned int cifs_get_num_sgs(const struct smb_rqst *rqst, * rqst[1+].rq_iov[0+] data to be encrypted/decrypted */ for (i = 0; i < num_rqst; i++) { - /* - * The first rqst has a transform header where the - * first 20 bytes are not part of the encrypted blob. + /* We really don't want a mixture of pinned and unpinned pages + * in the sglist. It's hard to keep track of which is what. + * Instead, we convert to a BVEC-type iterator higher up. */ + if (WARN_ON_ONCE(user_backed_iter(&rqst[i].rq_iter))) + return -EIO; + + /* We also don't want to have any extra refs or pins to clean + * up in the sglist. + */ + if (WARN_ON_ONCE(iov_iter_extract_will_pin(&rqst[i].rq_iter))) + return -EIO; + for (j = 0; j < rqst[i].rq_nvec; j++) { struct kvec *iov = &rqst[i].rq_iov[j]; - skip = (i == 0) && (j == 0) ? 
20 : 0; addr = (unsigned long)iov->iov_base + skip; if (unlikely(is_vmalloc_addr((void *)addr))) { len = iov->iov_len - skip; @@ -2185,8 +2181,9 @@ static inline unsigned int cifs_get_num_sgs(const struct smb_rqst *rqst, } else { nents++; } + skip = 0; } - nents += rqst[i].rq_npages; + nents += iov_iter_npages(&rqst[i].rq_iter, INT_MAX); } nents += DIV_ROUND_UP(offset_in_page(sig) + SMB2_SIGNATURE_SIZE, PAGE_SIZE); return nents; @@ -2195,9 +2192,9 @@ static inline unsigned int cifs_get_num_sgs(const struct smb_rqst *rqst, /* We can not use the normal sg_set_buf() as we will sometimes pass a * stack object as buf. */ -static inline struct scatterlist *cifs_sg_set_buf(struct scatterlist *sg, - const void *buf, - unsigned int buflen) +static inline void cifs_sg_set_buf(struct sg_table *sgtable, + const void *buf, + unsigned int buflen) { unsigned long addr = (unsigned long)buf; unsigned int off = offset_in_page(addr); @@ -2207,16 +2204,17 @@ static inline struct scatterlist *cifs_sg_set_buf(struct scatterlist *sg, do { unsigned int len = min_t(unsigned int, buflen, PAGE_SIZE - off); - sg_set_page(sg++, vmalloc_to_page((void *)addr), len, off); + sg_set_page(&sgtable->sgl[sgtable->nents++], + vmalloc_to_page((void *)addr), len, off); off = 0; addr += PAGE_SIZE; buflen -= len; } while (buflen); } else { - sg_set_page(sg++, virt_to_page(addr), buflen, off); + sg_set_page(&sgtable->sgl[sgtable->nents++], + virt_to_page(addr), buflen, off); } - return sg; } #endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 623caece2b10..445e3eaebcc1 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -562,7 +562,7 @@ typedef union smb_com_session_setup_andx { __u32 Reserved; __le32 Capabilities; /* see below */ __le16 ByteCount; - unsigned char SecurityBlob[1]; /* followed by */ + unsigned char SecurityBlob[]; /* followed by */ /* STRING NativeOS */ /* STRING NativeLanMan */ } __attribute__((packed)) req; /* NTLM request format (with @@ -582,7 +582,7 @@ typedef union smb_com_session_setup_andx { __u32 Reserved; /* see below */ __le32 Capabilities; __le16 ByteCount; - unsigned char CaseInsensitivePassword[1]; /* followed by: */ + unsigned char CaseInsensitivePassword[]; /* followed by: */ /* unsigned char * CaseSensitivePassword; */ /* STRING AccountName */ /* STRING PrimaryDomain */ @@ -599,7 +599,7 @@ typedef union smb_com_session_setup_andx { __le16 Action; /* see below */ __le16 SecurityBlobLength; __u16 ByteCount; - unsigned char SecurityBlob[1]; /* followed by */ + unsigned char SecurityBlob[]; /* followed by */ /* unsigned char * NativeOS; */ /* unsigned char * NativeLanMan; */ /* unsigned char * PrimaryDomain; */ @@ -618,7 +618,7 @@ typedef union smb_com_session_setup_andx { __le16 PasswordLength; __u32 Reserved; /* encrypt key len and offset */ __le16 ByteCount; - unsigned char AccountPassword[1]; /* followed by */ + unsigned char AccountPassword[]; /* followed by */ /* STRING AccountName */ /* STRING PrimaryDomain */ /* STRING NativeOS */ @@ -632,7 +632,7 @@ typedef union smb_com_session_setup_andx { __le16 AndXOffset; __le16 Action; /* see below */ __u16 ByteCount; - unsigned char NativeOS[1]; /* followed by */ + unsigned char NativeOS[]; /* followed by */ /* unsigned char * NativeLanMan; */ /* unsigned char * PrimaryDomain; */ } __attribute__((packed)) old_resp; /* pre-NTLM (LANMAN2.1) response */ @@ -693,7 +693,7 @@ typedef struct smb_com_tconx_req { __le16 Flags; /* see below */ __le16 PasswordLength; __le16 ByteCount; - unsigned char Password[1]; /* followed by 
*/ + unsigned char Password[]; /* followed by */ /* STRING Path *//* \\server\share name */ /* STRING Service */ } __attribute__((packed)) TCONX_REQ; @@ -705,7 +705,7 @@ typedef struct smb_com_tconx_rsp { __le16 AndXOffset; __le16 OptionalSupport; /* see below */ __u16 ByteCount; - unsigned char Service[1]; /* always ASCII, not Unicode */ + unsigned char Service[]; /* always ASCII, not Unicode */ /* STRING NativeFileSystem */ } __attribute__((packed)) TCONX_RSP; @@ -718,7 +718,7 @@ typedef struct smb_com_tconx_rsp_ext { __le32 MaximalShareAccessRights; __le32 GuestMaximalShareAccessRights; __u16 ByteCount; - unsigned char Service[1]; /* always ASCII, not Unicode */ + unsigned char Service[]; /* always ASCII, not Unicode */ /* STRING NativeFileSystem */ } __attribute__((packed)) TCONX_RSP_EXT; @@ -755,14 +755,14 @@ typedef struct smb_com_echo_req { struct smb_hdr hdr; __le16 EchoCount; __le16 ByteCount; - char Data[1]; + char Data[]; } __attribute__((packed)) ECHO_REQ; typedef struct smb_com_echo_rsp { struct smb_hdr hdr; __le16 SequenceNumber; __le16 ByteCount; - char Data[1]; + char Data[]; } __attribute__((packed)) ECHO_RSP; typedef struct smb_com_logoff_andx_req { @@ -862,7 +862,7 @@ typedef struct smb_com_open_req { /* also handles create */ __le32 ImpersonationLevel; __u8 SecurityFlags; __le16 ByteCount; - char fileName[1]; + char fileName[]; } __attribute__((packed)) OPEN_REQ; /* open response: oplock levels */ @@ -937,7 +937,7 @@ typedef struct smb_com_openx_req { __le32 Timeout; __le32 Reserved; __le16 ByteCount; /* file name follows */ - char fileName[1]; + char fileName[]; } __attribute__((packed)) OPENX_REQ; typedef struct smb_com_openx_rsp { @@ -1085,7 +1085,7 @@ typedef struct smb_com_lock_req { __le16 NumberOfUnlocks; __le16 NumberOfLocks; __le16 ByteCount; - LOCKING_ANDX_RANGE Locks[1]; + LOCKING_ANDX_RANGE Locks[]; } __attribute__((packed)) LOCK_REQ; /* lock type */ @@ -1114,7 +1114,7 @@ typedef struct smb_com_rename_req { __le16 SearchAttributes; /* target file attributes */ __le16 ByteCount; __u8 BufferFormat; /* 4 = ASCII or Unicode */ - unsigned char OldFileName[1]; + unsigned char OldFileName[]; /* followed by __u8 BufferFormat2 */ /* followed by NewFileName */ } __attribute__((packed)) RENAME_REQ; @@ -1134,7 +1134,7 @@ typedef struct smb_com_copy_req { __le16 Flags; __le16 ByteCount; __u8 BufferFormat; /* 4 = ASCII or Unicode */ - unsigned char OldFileName[1]; + unsigned char OldFileName[]; /* followed by __u8 BufferFormat2 */ /* followed by NewFileName string */ } __attribute__((packed)) COPY_REQ; @@ -1144,7 +1144,7 @@ typedef struct smb_com_copy_rsp { __le16 CopyCount; /* number of files copied */ __u16 ByteCount; /* may be zero */ __u8 BufferFormat; /* 0x04 - only present if errored file follows */ - unsigned char ErrorFileName[1]; /* only present if error in copy */ + unsigned char ErrorFileName[]; /* only present if error in copy */ } __attribute__((packed)) COPY_RSP; #define CREATE_HARD_LINK 0x103 @@ -1158,7 +1158,7 @@ typedef struct smb_com_nt_rename_req { /* A5 - also used for create hardlink */ __le32 ClusterCount; __le16 ByteCount; __u8 BufferFormat; /* 4 = ASCII or Unicode */ - unsigned char OldFileName[1]; + unsigned char OldFileName[]; /* followed by __u8 BufferFormat2 */ /* followed by NewFileName */ } __attribute__((packed)) NT_RENAME_REQ; @@ -1173,7 +1173,7 @@ typedef struct smb_com_delete_file_req { __le16 SearchAttributes; __le16 ByteCount; __u8 BufferFormat; /* 4 = ASCII */ - unsigned char fileName[1]; + unsigned char fileName[]; } 
__attribute__((packed)) DELETE_FILE_REQ; typedef struct smb_com_delete_file_rsp { @@ -1185,7 +1185,7 @@ typedef struct smb_com_delete_directory_req { struct smb_hdr hdr; /* wct = 0 */ __le16 ByteCount; __u8 BufferFormat; /* 4 = ASCII */ - unsigned char DirName[1]; + unsigned char DirName[]; } __attribute__((packed)) DELETE_DIRECTORY_REQ; typedef struct smb_com_delete_directory_rsp { @@ -1197,7 +1197,7 @@ typedef struct smb_com_create_directory_req { struct smb_hdr hdr; /* wct = 0 */ __le16 ByteCount; __u8 BufferFormat; /* 4 = ASCII */ - unsigned char DirName[1]; + unsigned char DirName[]; } __attribute__((packed)) CREATE_DIRECTORY_REQ; typedef struct smb_com_create_directory_rsp { @@ -1209,7 +1209,7 @@ typedef struct smb_com_query_information_req { struct smb_hdr hdr; /* wct = 0 */ __le16 ByteCount; /* 1 + namelen + 1 */ __u8 BufferFormat; /* 4 = ASCII */ - unsigned char FileName[1]; + unsigned char FileName[]; } __attribute__((packed)) QUERY_INFORMATION_REQ; typedef struct smb_com_query_information_rsp { @@ -1229,7 +1229,7 @@ typedef struct smb_com_setattr_req { __le16 reserved[5]; /* must be zero */ __u16 ByteCount; __u8 BufferFormat; /* 4 = ASCII */ - unsigned char fileName[1]; + unsigned char fileName[]; } __attribute__((packed)) SETATTR_REQ; typedef struct smb_com_setattr_rsp { @@ -1311,7 +1311,7 @@ typedef struct smb_com_transaction_ioctl_req { __u8 IsRootFlag; /* 1 = apply command to root of share (must be DFS) */ __le16 ByteCount; __u8 Pad[3]; - __u8 Data[1]; + __u8 Data[]; } __attribute__((packed)) TRANSACT_IOCTL_REQ; typedef struct smb_com_transaction_compr_ioctl_req { @@ -1430,7 +1430,7 @@ typedef struct smb_com_transaction_change_notify_req { __u8 Reserved2; __le16 ByteCount; /* __u8 Pad[3];*/ -/* __u8 Data[1];*/ +/* __u8 Data[];*/ } __attribute__((packed)) TRANSACT_CHANGE_NOTIFY_REQ; /* BB eventually change to use generic ntransact rsp struct @@ -1519,7 +1519,7 @@ struct cifs_quota_data { __u64 space_used; __u64 soft_limit; __u64 hard_limit; - char sid[1]; /* variable size? */ + char sid[]; /* variable size? */ } __attribute__((packed)); /* quota sub commands */ @@ -1671,7 +1671,7 @@ typedef struct smb_com_transaction2_qpi_req { __u8 Pad; __le16 InformationLevel; __u32 Reserved4; - char FileName[1]; + char FileName[]; } __attribute__((packed)) TRANSACTION2_QPI_REQ; typedef struct smb_com_transaction2_qpi_rsp { @@ -1704,7 +1704,7 @@ typedef struct smb_com_transaction2_spi_req { __u16 Pad1; __le16 InformationLevel; __u32 Reserved4; - char FileName[1]; + char FileName[]; } __attribute__((packed)) TRANSACTION2_SPI_REQ; typedef struct smb_com_transaction2_spi_rsp { @@ -1809,7 +1809,7 @@ typedef struct smb_com_transaction2_ffirst_req { __le16 SearchFlags; __le16 InformationLevel; __le32 SearchStorageType; - char FileName[1]; + char FileName[]; } __attribute__((packed)) TRANSACTION2_FFIRST_REQ; typedef struct smb_com_transaction2_ffirst_rsp { @@ -2020,7 +2020,7 @@ typedef struct smb_com_transaction2_get_dfs_refer_req { perhaps?) 
followed by one byte pad - doesn't seem to matter though */ __le16 MaxReferralLevel; - char RequestFileName[1]; + char RequestFileName[]; } __attribute__((packed)) TRANSACTION2_GET_DFS_REFER_REQ; #define DFS_VERSION cpu_to_le16(0x0003) @@ -2049,7 +2049,7 @@ struct get_dfs_referral_rsp { __le16 PathConsumed; __le16 NumberOfReferrals; __le32 DFSFlags; - REFERRAL3 referrals[1]; /* array of level 3 dfs_referral structures */ + REFERRAL3 referrals[]; /* array of level 3 dfs_referral structures */ /* followed by the strings pointed to by the referral structures */ } __packed; @@ -2284,7 +2284,10 @@ typedef struct { /* data block encoding of response to level 263 QPathInfo */ __le32 Mode; __le32 AlignmentRequirement; __le32 FileNameLength; - char FileName[1]; + union { + char __pad; + DECLARE_FLEX_ARRAY(char, FileName); + }; } __attribute__((packed)) FILE_ALL_INFO; /* level 0x107 QPathInfo */ typedef struct { @@ -2322,7 +2325,7 @@ typedef struct { } __attribute__((packed)) FILE_UNIX_BASIC_INFO; /* level 0x200 QPathInfo */ typedef struct { - char LinkDest[1]; + DECLARE_FLEX_ARRAY(char, LinkDest); } __attribute__((packed)) FILE_UNIX_LINK_INFO; /* level 0x201 QPathInfo */ /* The following three structures are needed only for @@ -2371,7 +2374,7 @@ struct file_end_of_file_info { } __attribute__((packed)); /* size info, level 0x104 for set, 0x106 for query */ struct file_alt_name_info { - __u8 alt_name[1]; + DECLARE_FLEX_ARRAY(__u8, alt_name); } __attribute__((packed)); /* level 0x0108 */ struct file_stream_info { @@ -2480,7 +2483,10 @@ typedef struct { __le32 NextEntryOffset; __u32 ResumeKey; /* as with FileIndex - no need to convert */ FILE_UNIX_BASIC_INFO basic; - char FileName[1]; + union { + char __pad; + DECLARE_FLEX_ARRAY(char, FileName); + }; } __attribute__((packed)) FILE_UNIX_INFO; /* level 0x202 */ typedef struct { @@ -2494,7 +2500,7 @@ typedef struct { __le64 AllocationSize; __le32 ExtFileAttributes; __le32 FileNameLength; - char FileName[1]; + char FileName[]; } __attribute__((packed)) FILE_DIRECTORY_INFO; /* level 0x101 FF resp data */ typedef struct { @@ -2509,7 +2515,7 @@ typedef struct { __le32 ExtFileAttributes; __le32 FileNameLength; __le32 EaSize; /* length of the xattrs */ - char FileName[1]; + char FileName[]; } __attribute__((packed)) FILE_FULL_DIRECTORY_INFO; /* level 0x102 rsp data */ typedef struct { @@ -2526,7 +2532,7 @@ typedef struct { __le32 EaSize; /* EA size */ __le32 Reserved; __le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/ - char FileName[1]; + char FileName[]; } __attribute__((packed)) SEARCH_ID_FULL_DIR_INFO; /* level 0x105 FF rsp data */ typedef struct { @@ -2544,7 +2550,7 @@ typedef struct { __u8 ShortNameLength; __u8 Reserved; __u8 ShortName[24]; - char FileName[1]; + char FileName[]; } __attribute__((packed)) FILE_BOTH_DIRECTORY_INFO; /* level 0x104 FFrsp data */ typedef struct { @@ -2559,7 +2565,7 @@ typedef struct { __le32 AllocationSize; __le16 Attributes; /* verify not u32 */ __u8 FileNameLength; - char FileName[1]; + char FileName[]; } __attribute__((packed)) FIND_FILE_STANDARD_INFO; /* level 0x1 FF resp data */ @@ -2569,21 +2575,11 @@ struct win_dev { __le64 minor; } __attribute__((packed)); -struct gea { - unsigned char name_len; - char name[1]; -} __attribute__((packed)); - -struct gealist { - unsigned long list_len; - struct gea list[1]; -} __attribute__((packed)); - struct fea { unsigned char EA_flags; __u8 name_len; __le16 value_len; - char name[1]; + char name[]; /* optionally followed by value */ } __attribute__((packed)); /* 
flags for _FEA.fEA */ @@ -2591,7 +2587,7 @@ struct fea { struct fealist { __le32 list_len; - struct fea list[1]; + struct fea list; } __attribute__((packed)); /* used to hold an arbitrary blob of data */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 1207b39686fb..b7a36ebd0f2f 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -225,9 +225,9 @@ extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, const char *, u32 *, u32); extern struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *, const struct cifs_fid *, u32 *, u32); -extern struct posix_acl *cifs_get_acl(struct user_namespace *mnt_userns, +extern struct posix_acl *cifs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type); -extern int cifs_set_acl(struct user_namespace *mnt_userns, +extern int cifs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, const char *, int); @@ -244,6 +244,9 @@ extern int cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, unsigned int page_offset, unsigned int to_read); +int cifs_read_iter_from_socket(struct TCP_Server_Info *server, + struct iov_iter *iter, + unsigned int to_read); extern int cifs_setup_cifs_sb(struct cifs_sb_info *cifs_sb); void cifs_mount_put_conns(struct cifs_mount_ctx *mnt_ctx); int cifs_mount_get_session(struct cifs_mount_ctx *mnt_ctx); @@ -581,10 +584,7 @@ int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid); int cifs_async_writev(struct cifs_writedata *wdata, void (*release)(struct kref *kref)); void cifs_writev_complete(struct work_struct *work); -struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages, - work_func_t complete); -struct cifs_writedata *cifs_writedata_direct_alloc(struct page **pages, - work_func_t complete); +struct cifs_writedata *cifs_writedata_alloc(work_func_t complete); void cifs_writedata_release(struct kref *refcount); int cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, @@ -601,13 +601,10 @@ enum securityEnum cifs_select_sectype(struct TCP_Server_Info *, enum securityEnum); struct cifs_aio_ctx *cifs_aio_ctx_alloc(void); void cifs_aio_ctx_release(struct kref *refcount); -int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw); int cifs_alloc_hash(const char *name, struct shash_desc **sdesc); void cifs_free_hash(struct shash_desc **sdesc); -void rqst_page_get_length(const struct smb_rqst *rqst, unsigned int page, - unsigned int *len, unsigned int *offset); struct cifs_chan * cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server); int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 23f10e0d6e7e..a24e4ddf8043 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -15,6 +15,7 @@ /* want to reuse a stale file handle and only the caller knows the file info */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/kernel.h> #include <linux/vfs.h> #include <linux/slab.h> @@ -24,6 +25,7 @@ #include <linux/task_io_accounting_ops.h> #include <linux/uaccess.h> #include "cifspdu.h" +#include "cifsfs.h" #include "cifsglob.h" #include "cifsacl.h" #include "cifsproto.h" @@ -1294,11 +1296,8 @@ cifs_readv_callback(struct mid_q_entry *mid) struct TCP_Server_Info *server = tcon->ses->server; struct smb_rqst rqst = { .rq_iov = rdata->iov, .rq_nvec = 2, - .rq_pages 
= rdata->pages, - .rq_offset = rdata->page_offset, - .rq_npages = rdata->nr_pages, - .rq_pagesz = rdata->pagesz, - .rq_tailsz = rdata->tailsz }; + .rq_iter_size = iov_iter_count(&rdata->iter), + .rq_iter = rdata->iter }; struct cifs_credits credits = { .value = 1, .instance = 0 }; cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n", @@ -1737,11 +1736,8 @@ cifs_async_writev(struct cifs_writedata *wdata, rqst.rq_iov = iov; rqst.rq_nvec = 2; - rqst.rq_pages = wdata->pages; - rqst.rq_offset = wdata->page_offset; - rqst.rq_npages = wdata->nr_pages; - rqst.rq_pagesz = wdata->pagesz; - rqst.rq_tailsz = wdata->tailsz; + rqst.rq_iter = wdata->iter; + rqst.rq_iter_size = iov_iter_count(&wdata->iter); cifs_dbg(FYI, "async write at %llu %u bytes\n", wdata->offset, wdata->bytes); @@ -5372,14 +5368,15 @@ CIFSSMBSetPathInfoFB(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid fid; int rc; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = GENERIC_WRITE; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.disposition = FILE_OPEN; - oparms.path = fileName; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = GENERIC_WRITE, + .create_options = cifs_create_options(cifs_sb, 0), + .disposition = FILE_OPEN, + .path = fileName, + .fid = &fid, + }; rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc) @@ -5786,7 +5783,7 @@ QAllEAsRetry: /* account for ea list len */ list_len -= 4; - temp_fea = ea_response_data->list; + temp_fea = &ea_response_data->list; temp_ptr = (char *)temp_fea; while (list_len > 0) { unsigned int name_len; @@ -5901,7 +5898,7 @@ SetEARetry: else name_len = strnlen(ea_name, 255); - count = sizeof(*parm_data) + ea_value_len + name_len; + count = sizeof(*parm_data) + 1 + ea_value_len + name_len; pSMB->MaxParameterCount = cpu_to_le16(2); /* BB find max SMB PDU from sess */ pSMB->MaxDataCount = cpu_to_le16(1000); @@ -5925,14 +5922,14 @@ SetEARetry: byte_count = 3 /* pad */ + params + count; pSMB->DataCount = cpu_to_le16(count); parm_data->list_len = cpu_to_le32(count); - parm_data->list[0].EA_flags = 0; + parm_data->list.EA_flags = 0; /* we checked above that name len is less than 255 */ - parm_data->list[0].name_len = (__u8)name_len; + parm_data->list.name_len = (__u8)name_len; /* EA names are always ASCII */ if (ea_name) - strncpy(parm_data->list[0].name, ea_name, name_len); - parm_data->list[0].name[name_len] = 0; - parm_data->list[0].value_len = cpu_to_le16(ea_value_len); + strncpy(parm_data->list.name, ea_name, name_len); + parm_data->list.name[name_len] = '\0'; + parm_data->list.value_len = cpu_to_le16(ea_value_len); /* caller ensures that ea_value_len is less than 64K but we need to ensure that it fits within the smb */ @@ -5940,7 +5937,7 @@ SetEARetry: negotiated SMB buffer size BB */ /* if (ea_value_len > buffer_size - 512 (enough for header)) */ if (ea_value_len) - memcpy(parm_data->list[0].name+name_len+1, + memcpy(parm_data->list.name + name_len + 1, ea_value, ea_value_len); pSMB->TotalDataCount = pSMB->DataCount; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index b2a04b4e89a5..ec020d860be3 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -79,8 +79,6 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server) int len; char *unc; struct sockaddr_storage ss; - time64_t expiry, now; - unsigned long ttl = SMB_DNS_RESOLVE_INTERVAL_DEFAULT; if (!server->hostname) return -EINVAL; @@ -102,29 +100,19 @@ static 
int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server) ss = server->dstaddr; spin_unlock(&server->srv_lock); - rc = dns_resolve_server_name_to_ip(unc, (struct sockaddr *)&ss, &expiry); + rc = dns_resolve_server_name_to_ip(unc, (struct sockaddr *)&ss, NULL); kfree(unc); if (rc < 0) { cifs_dbg(FYI, "%s: failed to resolve server part of %s to IP: %d\n", __func__, server->hostname, rc); - goto requeue_resolve; + } else { + spin_lock(&server->srv_lock); + memcpy(&server->dstaddr, &ss, sizeof(server->dstaddr)); + spin_unlock(&server->srv_lock); + rc = 0; } - spin_lock(&server->srv_lock); - memcpy(&server->dstaddr, &ss, sizeof(server->dstaddr)); - spin_unlock(&server->srv_lock); - - now = ktime_get_real_seconds(); - if (expiry && expiry > now) - /* To make sure we don't use the cached entry, retry 1s */ - ttl = max_t(unsigned long, expiry - now, SMB_DNS_RESOLVE_INTERVAL_MIN) + 1; - -requeue_resolve: - cifs_dbg(FYI, "%s: next dns resolution scheduled for %lu seconds in the future\n", - __func__, ttl); - mod_delayed_work(cifsiod_wq, &server->resolve, (ttl * HZ)); - return rc; } @@ -148,26 +136,6 @@ static void smb2_query_server_interfaces(struct work_struct *work) (SMB_INTERFACE_POLL_INTERVAL * HZ)); } -static void cifs_resolve_server(struct work_struct *work) -{ - int rc; - struct TCP_Server_Info *server = container_of(work, - struct TCP_Server_Info, resolve.work); - - cifs_server_lock(server); - - /* - * Resolve the hostname again to make sure that IP address is up-to-date. - */ - rc = reconn_set_ipaddr_from_hostname(server); - if (rc) { - cifs_dbg(FYI, "%s: failed to resolve hostname: %d\n", - __func__, rc); - } - - cifs_server_unlock(server); -} - /* * Update the tcpStatus for the server. * This is used to signal the cifsd thread to call cifs_reconnect @@ -759,12 +727,27 @@ cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, unsigned int page_offset, unsigned int to_read) { struct msghdr smb_msg = {}; - struct bio_vec bv = { - .bv_page = page, .bv_len = to_read, .bv_offset = page_offset}; + struct bio_vec bv; + + bvec_set_page(&bv, page, to_read, page_offset); iov_iter_bvec(&smb_msg.msg_iter, ITER_DEST, &bv, 1, to_read); return cifs_readv_from_socket(server, &smb_msg); } +int +cifs_read_iter_from_socket(struct TCP_Server_Info *server, struct iov_iter *iter, + unsigned int to_read) +{ + struct msghdr smb_msg = { .msg_iter = *iter }; + int ret; + + iov_iter_truncate(&smb_msg.msg_iter, to_read); + ret = cifs_readv_from_socket(server, &smb_msg); + if (ret > 0) + iov_iter_advance(iter, ret); + return ret; +} + static bool is_smb_response(struct TCP_Server_Info *server, unsigned char type) { @@ -925,7 +908,6 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) spin_unlock(&server->srv_lock); cancel_delayed_work_sync(&server->echo); - cancel_delayed_work_sync(&server->resolve); spin_lock(&server->srv_lock); server->tcpStatus = CifsExiting; @@ -1549,7 +1531,6 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) cifs_put_tcp_session(server->primary_server, from_reconnect); cancel_delayed_work_sync(&server->echo); - cancel_delayed_work_sync(&server->resolve); if (from_reconnect) /* @@ -1655,7 +1636,6 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); INIT_LIST_HEAD(&tcp_ses->smb_ses_list); INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request); - INIT_DELAYED_WORK(&tcp_ses->resolve, cifs_resolve_server); INIT_DELAYED_WORK(&tcp_ses->reconnect, smb2_reconnect_server); 
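/*
 * Editorial sketch, not part of the patch: how a receive path might drive
 * the new cifs_read_iter_from_socket() added above. The helper copies the
 * caller's iterator into a msghdr, truncates the copy to the requested
 * length, and advances the caller's iterator only by the bytes actually
 * received, so short reads can simply be retried. Everything below except
 * the helper itself (the buffer setup and the demo_read_exact() wrapper)
 * is hypothetical illustration.
 */
#include <linux/uio.h>

static int demo_read_exact(struct TCP_Server_Info *server, void *buf, size_t want)
{
	struct kvec kv = { .iov_base = buf, .iov_len = want };
	struct iov_iter iter;
	int ret;

	/* Wrap the kernel buffer in an ITER_KVEC destination iterator. */
	iov_iter_kvec(&iter, ITER_DEST, &kv, 1, want);

	while (iov_iter_count(&iter)) {
		ret = cifs_read_iter_from_socket(server, &iter,
						 iov_iter_count(&iter));
		if (ret < 0)
			return ret;		/* transport error */
		if (ret == 0)
			return -ECONNABORTED;	/* peer closed mid-read */
		/* On ret > 0 the helper has already advanced "iter". */
	}
	return 0;
}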
mutex_init(&tcp_ses->reconnect_mutex); #ifdef CONFIG_CIFS_DFS_UPCALL @@ -1744,12 +1724,6 @@ smbd_connected: /* queue echo request delayed work */ queue_delayed_work(cifsiod_wq, &tcp_ses->echo, tcp_ses->echo_interval); - /* queue dns resolution delayed work */ - cifs_dbg(FYI, "%s: next dns resolution scheduled for %d seconds in the future\n", - __func__, SMB_DNS_RESOLVE_INTERVAL_DEFAULT); - - queue_delayed_work(cifsiod_wq, &tcp_ses->resolve, (SMB_DNS_RESOLVE_INTERVAL_DEFAULT * HZ)); - return tcp_ses; out_err_crypto_release: @@ -2843,72 +2817,48 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) * negprot - BB check reconnection in case where second * sessinit is sent but no second negprot */ - struct rfc1002_session_packet *ses_init_buf; - unsigned int req_noscope_len; - struct smb_hdr *smb_buf; + struct rfc1002_session_packet req = {}; + struct smb_hdr *smb_buf = (struct smb_hdr *)&req; + unsigned int len; - ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet), - GFP_KERNEL); + req.trailer.session_req.called_len = sizeof(req.trailer.session_req.called_name); - if (ses_init_buf) { - ses_init_buf->trailer.session_req.called_len = 32; + if (server->server_RFC1001_name[0] != 0) + rfc1002mangle(req.trailer.session_req.called_name, + server->server_RFC1001_name, + RFC1001_NAME_LEN_WITH_NULL); + else + rfc1002mangle(req.trailer.session_req.called_name, + DEFAULT_CIFS_CALLED_NAME, + RFC1001_NAME_LEN_WITH_NULL); - if (server->server_RFC1001_name[0] != 0) - rfc1002mangle(ses_init_buf->trailer. - session_req.called_name, - server->server_RFC1001_name, - RFC1001_NAME_LEN_WITH_NULL); - else - rfc1002mangle(ses_init_buf->trailer. - session_req.called_name, - DEFAULT_CIFS_CALLED_NAME, - RFC1001_NAME_LEN_WITH_NULL); + req.trailer.session_req.calling_len = sizeof(req.trailer.session_req.calling_name); - ses_init_buf->trailer.session_req.calling_len = 32; + /* calling name ends in null (byte 16) from old smb convention */ + if (server->workstation_RFC1001_name[0] != 0) + rfc1002mangle(req.trailer.session_req.calling_name, + server->workstation_RFC1001_name, + RFC1001_NAME_LEN_WITH_NULL); + else + rfc1002mangle(req.trailer.session_req.calling_name, + "LINUX_CIFS_CLNT", + RFC1001_NAME_LEN_WITH_NULL); - /* - * calling name ends in null (byte 16) from old smb - * convention. - */ - if (server->workstation_RFC1001_name[0] != 0) - rfc1002mangle(ses_init_buf->trailer. - session_req.calling_name, - server->workstation_RFC1001_name, - RFC1001_NAME_LEN_WITH_NULL); - else - rfc1002mangle(ses_init_buf->trailer. - session_req.calling_name, - "LINUX_CIFS_CLNT", - RFC1001_NAME_LEN_WITH_NULL); - - ses_init_buf->trailer.session_req.scope1 = 0; - ses_init_buf->trailer.session_req.scope2 = 0; - smb_buf = (struct smb_hdr *)ses_init_buf; - - /* sizeof RFC1002_SESSION_REQUEST with no scopes */ - req_noscope_len = sizeof(struct rfc1002_session_packet) - 2; - - /* == cpu_to_be32(0x81000044) */ - smb_buf->smb_buf_length = - cpu_to_be32((RFC1002_SESSION_REQUEST << 24) | req_noscope_len); - rc = smb_send(server, smb_buf, 0x44); - kfree(ses_init_buf); - /* - * RFC1001 layer in at least one server - * requires very short break before negprot - * presumably because not expecting negprot - * to follow so fast. 
This is a simple - * solution that works without - * complicating the code and causes no - * significant slowing down on mount - * for everyone else - */ - usleep_range(1000, 2000); - } /* - * else the negprot may still work without this - * even though malloc failed + * As per rfc1002, @len must be the number of bytes that follow the + * length field of a rfc1002 session request payload. */ + len = sizeof(req) - offsetof(struct rfc1002_session_packet, trailer.session_req); + + smb_buf->smb_buf_length = cpu_to_be32((RFC1002_SESSION_REQUEST << 24) | len); + rc = smb_send(server, smb_buf, len); + /* + * RFC1001 layer in at least one server requires very short break before + * negprot presumably because not expecting negprot to follow so fast. + * This is a simple solution that works without complicating the code + * and causes no significant slowing down on mount for everyone else + */ + usleep_range(1000, 2000); return rc; } @@ -3759,16 +3709,12 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, struct nls_table *nls_info) { int rc = -ENOSYS; - struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; - struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; + struct TCP_Server_Info *pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&pserver->dstaddr; + struct sockaddr_in *addr = (struct sockaddr_in *)&pserver->dstaddr; bool is_binding = false; spin_lock(&ses->ses_lock); - if (server->dstaddr.ss_family == AF_INET6) - scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI6", &addr6->sin6_addr); - else - scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI4", &addr->sin_addr); - if (ses->ses_status != SES_GOOD && ses->ses_status != SES_NEW && ses->ses_status != SES_NEED_RECON) { @@ -3792,6 +3738,14 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, ses->ses_status = SES_IN_SETUP; spin_unlock(&ses->ses_lock); + /* update ses ip_addr only for primary chan */ + if (server == pserver) { + if (server->dstaddr.ss_family == AF_INET6) + scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI6", &addr6->sin6_addr); + else + scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI4", &addr->sin_addr); + } + if (!is_binding) { ses->capabilities = server->capabilities; if (!linuxExtEnabled) diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index ad4208bf1e32..30b1e1bfd204 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -304,15 +304,16 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned if (!tcon->unix_ext && (mode & S_IWUGO) == 0) create_options |= CREATE_OPTION_READONLY; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = desired_access; - oparms.create_options = cifs_create_options(cifs_sb, create_options); - oparms.disposition = disposition; - oparms.path = full_path; - oparms.fid = fid; - oparms.reconnect = false; - oparms.mode = mode; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = desired_access, + .create_options = cifs_create_options(cifs_sb, create_options), + .disposition = disposition, + .path = full_path, + .fid = fid, + .mode = mode, + }; rc = server->ops->open(xid, &oparms, oplock, buf); if (rc) { cifs_dbg(FYI, "cifs_create returned 0x%x\n", rc); @@ -529,7 +530,7 @@ out_free_xid: return rc; } -int cifs_create(struct user_namespace *mnt_userns, struct inode *inode, +int cifs_create(struct mnt_idmap *idmap, struct inode *inode, struct dentry *direntry, umode_t mode, bool excl) {
int rc; @@ -579,7 +580,7 @@ out_free_xid: return rc; } -int cifs_mknod(struct user_namespace *mnt_userns, struct inode *inode, +int cifs_mknod(struct mnt_idmap *idmap, struct inode *inode, struct dentry *direntry, umode_t mode, dev_t device_number) { int rc = -EPERM; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 22dfc1f8b4f1..ebfcaae8c437 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -9,6 +9,7 @@ * */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/backing-dev.h> #include <linux/stat.h> #include <linux/fcntl.h> @@ -37,6 +38,125 @@ #include "cached_dir.h" /* + * Remove the dirty flags from a span of pages. + */ +static void cifs_undirty_folios(struct inode *inode, loff_t start, unsigned int len) +{ + struct address_space *mapping = inode->i_mapping; + struct folio *folio; + pgoff_t end; + + XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE); + + rcu_read_lock(); + + end = (start + len - 1) / PAGE_SIZE; + xas_for_each_marked(&xas, folio, end, PAGECACHE_TAG_DIRTY) { + xas_pause(&xas); + rcu_read_unlock(); + folio_lock(folio); + folio_clear_dirty_for_io(folio); + folio_unlock(folio); + rcu_read_lock(); + } + + rcu_read_unlock(); +} + +/* + * Completion of write to server. + */ +void cifs_pages_written_back(struct inode *inode, loff_t start, unsigned int len) +{ + struct address_space *mapping = inode->i_mapping; + struct folio *folio; + pgoff_t end; + + XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE); + + if (!len) + return; + + rcu_read_lock(); + + end = (start + len - 1) / PAGE_SIZE; + xas_for_each(&xas, folio, end) { + if (!folio_test_writeback(folio)) { + WARN_ONCE(1, "bad %x @%llx page %lx %lx\n", + len, start, folio_index(folio), end); + continue; + } + + folio_detach_private(folio); + folio_end_writeback(folio); + } + + rcu_read_unlock(); +} + +/* + * Failure of write to server. + */ +void cifs_pages_write_failed(struct inode *inode, loff_t start, unsigned int len) +{ + struct address_space *mapping = inode->i_mapping; + struct folio *folio; + pgoff_t end; + + XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE); + + if (!len) + return; + + rcu_read_lock(); + + end = (start + len - 1) / PAGE_SIZE; + xas_for_each(&xas, folio, end) { + if (!folio_test_writeback(folio)) { + WARN_ONCE(1, "bad %x @%llx page %lx %lx\n", + len, start, folio_index(folio), end); + continue; + } + + folio_set_error(folio); + folio_end_writeback(folio); + } + + rcu_read_unlock(); +} + +/* + * Redirty pages after a temporary failure. + */ +void cifs_pages_write_redirty(struct inode *inode, loff_t start, unsigned int len) +{ + struct address_space *mapping = inode->i_mapping; + struct folio *folio; + pgoff_t end; + + XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE); + + if (!len) + return; + + rcu_read_lock(); + + end = (start + len - 1) / PAGE_SIZE; + xas_for_each(&xas, folio, end) { + if (!folio_test_writeback(folio)) { + WARN_ONCE(1, "bad %x @%llx page %lx %lx\n", + len, start, folio_index(folio), end); + continue; + } + + filemap_dirty_folio(folio->mapping, folio); + folio_end_writeback(folio); + } + + rcu_read_unlock(); +} + +/* * Mark as invalid, all open files on tree connections since they * were closed when session to server was lost. 
*/ @@ -260,14 +380,15 @@ static int cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_ if (f_flags & O_DIRECT) create_options |= CREATE_NO_BUFFER; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = desired_access; - oparms.create_options = cifs_create_options(cifs_sb, create_options); - oparms.disposition = disposition; - oparms.path = full_path; - oparms.fid = fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = desired_access, + .create_options = cifs_create_options(cifs_sb, create_options), + .disposition = disposition, + .path = full_path, + .fid = fid, + }; rc = server->ops->open(xid, &oparms, oplock, buf); if (rc) @@ -848,14 +969,16 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) if (server->ops->get_lease_key) server->ops->get_lease_key(inode, &cfile->fid); - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = desired_access; - oparms.create_options = cifs_create_options(cifs_sb, create_options); - oparms.disposition = disposition; - oparms.path = full_path; - oparms.fid = &cfile->fid; - oparms.reconnect = true; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = desired_access, + .create_options = cifs_create_options(cifs_sb, create_options), + .disposition = disposition, + .path = full_path, + .fid = &cfile->fid, + .reconnect = true, + }; /* * Can not refresh inode by passing in file_info buf to be returned by @@ -2295,7 +2418,6 @@ cifs_writedata_release(struct kref *refcount) if (wdata->cfile) cifsFileInfo_put(wdata->cfile); - kvfree(wdata->pages); kfree(wdata); } @@ -2306,51 +2428,49 @@ cifs_writedata_release(struct kref *refcount) static void cifs_writev_requeue(struct cifs_writedata *wdata) { - int i, rc = 0; + int rc = 0; struct inode *inode = d_inode(wdata->cfile->dentry); struct TCP_Server_Info *server; - unsigned int rest_len; + unsigned int rest_len = wdata->bytes; + loff_t fpos = wdata->offset; server = tlink_tcon(wdata->cfile->tlink)->ses->server; - i = 0; - rest_len = wdata->bytes; do { struct cifs_writedata *wdata2; - unsigned int j, nr_pages, wsize, tailsz, cur_len; + unsigned int wsize, cur_len; wsize = server->ops->wp_retry_size(inode); if (wsize < rest_len) { - nr_pages = wsize / PAGE_SIZE; - if (!nr_pages) { + if (wsize < PAGE_SIZE) { rc = -EOPNOTSUPP; break; } - cur_len = nr_pages * PAGE_SIZE; - tailsz = PAGE_SIZE; + cur_len = min(round_down(wsize, PAGE_SIZE), rest_len); } else { - nr_pages = DIV_ROUND_UP(rest_len, PAGE_SIZE); cur_len = rest_len; - tailsz = rest_len - (nr_pages - 1) * PAGE_SIZE; } - wdata2 = cifs_writedata_alloc(nr_pages, cifs_writev_complete); + wdata2 = cifs_writedata_alloc(cifs_writev_complete); if (!wdata2) { rc = -ENOMEM; break; } - for (j = 0; j < nr_pages; j++) { - wdata2->pages[j] = wdata->pages[i + j]; - lock_page(wdata2->pages[j]); - clear_page_dirty_for_io(wdata2->pages[j]); - } - wdata2->sync_mode = wdata->sync_mode; - wdata2->nr_pages = nr_pages; - wdata2->offset = page_offset(wdata2->pages[0]); - wdata2->pagesz = PAGE_SIZE; - wdata2->tailsz = tailsz; - wdata2->bytes = cur_len; + wdata2->offset = fpos; + wdata2->bytes = cur_len; + wdata2->iter = wdata->iter; + + iov_iter_advance(&wdata2->iter, fpos - wdata->offset); + iov_iter_truncate(&wdata2->iter, wdata2->bytes); + + if (iov_iter_is_xarray(&wdata2->iter)) + /* Check for pages having been redirtied and clean + * them. We can do this by walking the xarray. 
If + * it's not an xarray, then it's a DIO and we shouldn't + * be mucking around with the page bits. + */ + cifs_undirty_folios(inode, fpos, cur_len); rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY, &wdata2->cfile); @@ -2365,33 +2485,22 @@ cifs_writev_requeue(struct cifs_writedata *wdata) cifs_writedata_release); } - for (j = 0; j < nr_pages; j++) { - unlock_page(wdata2->pages[j]); - if (rc != 0 && !is_retryable_error(rc)) { - SetPageError(wdata2->pages[j]); - end_page_writeback(wdata2->pages[j]); - put_page(wdata2->pages[j]); - } - } - kref_put(&wdata2->refcount, cifs_writedata_release); if (rc) { if (is_retryable_error(rc)) continue; - i += nr_pages; + fpos += cur_len; + rest_len -= cur_len; break; } + fpos += cur_len; rest_len -= cur_len; - i += nr_pages; - } while (i < wdata->nr_pages); + } while (rest_len > 0); - /* cleanup remaining pages from the original wdata */ - for (; i < wdata->nr_pages; i++) { - SetPageError(wdata->pages[i]); - end_page_writeback(wdata->pages[i]); - put_page(wdata->pages[i]); - } + /* Clean up remaining pages from the original wdata */ + if (iov_iter_is_xarray(&wdata->iter)) + cifs_pages_write_failed(inode, fpos, rest_len); if (rc != 0 && !is_retryable_error(rc)) mapping_set_error(inode->i_mapping, rc); @@ -2404,7 +2513,6 @@ cifs_writev_complete(struct work_struct *work) struct cifs_writedata *wdata = container_of(work, struct cifs_writedata, work); struct inode *inode = d_inode(wdata->cfile->dentry); - int i = 0; if (wdata->result == 0) { spin_lock(&inode->i_lock); @@ -2415,45 +2523,24 @@ cifs_writev_complete(struct work_struct *work) } else if (wdata->sync_mode == WB_SYNC_ALL && wdata->result == -EAGAIN) return cifs_writev_requeue(wdata); - for (i = 0; i < wdata->nr_pages; i++) { - struct page *page = wdata->pages[i]; + if (wdata->result == -EAGAIN) + cifs_pages_write_redirty(inode, wdata->offset, wdata->bytes); + else if (wdata->result < 0) + cifs_pages_write_failed(inode, wdata->offset, wdata->bytes); + else + cifs_pages_written_back(inode, wdata->offset, wdata->bytes); - if (wdata->result == -EAGAIN) - __set_page_dirty_nobuffers(page); - else if (wdata->result < 0) - SetPageError(page); - end_page_writeback(page); - cifs_readpage_to_fscache(inode, page); - put_page(page); - } if (wdata->result != -EAGAIN) mapping_set_error(inode->i_mapping, wdata->result); kref_put(&wdata->refcount, cifs_writedata_release); } -struct cifs_writedata * -cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete) -{ - struct cifs_writedata *writedata = NULL; - struct page **pages = - kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (pages) { - writedata = cifs_writedata_direct_alloc(pages, complete); - if (!writedata) - kvfree(pages); - } - - return writedata; -} - -struct cifs_writedata * -cifs_writedata_direct_alloc(struct page **pages, work_func_t complete) +struct cifs_writedata *cifs_writedata_alloc(work_func_t complete) { struct cifs_writedata *wdata; wdata = kzalloc(sizeof(*wdata), GFP_NOFS); if (wdata != NULL) { - wdata->pages = pages; kref_init(&wdata->refcount); INIT_LIST_HEAD(&wdata->list); init_completion(&wdata->done); @@ -2462,7 +2549,6 @@ cifs_writedata_direct_alloc(struct page **pages, work_func_t complete) return wdata; } - static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) { struct address_space *mapping = page->mapping; @@ -2521,310 +2607,387 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) return rc; } -static struct cifs_writedata * -wdata_alloc_and_fillpages(pgoff_t 
tofind, struct address_space *mapping, - pgoff_t end, pgoff_t *index, - unsigned int *found_pages) +/* + * Extend the region to be written back to include subsequent contiguously + * dirty pages if possible, but don't sleep while doing so. + */ +static void cifs_extend_writeback(struct address_space *mapping, + long *_count, + loff_t start, + int max_pages, + size_t max_len, + unsigned int *_len) { - struct cifs_writedata *wdata; - - wdata = cifs_writedata_alloc((unsigned int)tofind, - cifs_writev_complete); - if (!wdata) - return NULL; - - *found_pages = find_get_pages_range_tag(mapping, index, end, - PAGECACHE_TAG_DIRTY, tofind, wdata->pages); - return wdata; -} + struct folio_batch batch; + struct folio *folio; + unsigned int psize, nr_pages; + size_t len = *_len; + pgoff_t index = (start + len) / PAGE_SIZE; + bool stop = true; + unsigned int i; + XA_STATE(xas, &mapping->i_pages, index); -static unsigned int -wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages, - struct address_space *mapping, - struct writeback_control *wbc, - pgoff_t end, pgoff_t *index, pgoff_t *next, bool *done) -{ - unsigned int nr_pages = 0, i; - struct page *page; + folio_batch_init(&batch); - for (i = 0; i < found_pages; i++) { - page = wdata->pages[i]; - /* - * At this point we hold neither the i_pages lock nor the - * page lock: the page may be truncated or invalidated - * (changing page->mapping to NULL), or even swizzled - * back from swapper_space to tmpfs file mapping + do { + /* Firstly, we gather up a batch of contiguous dirty pages + * under the RCU read lock - but we can't clear the dirty flags + * there if any of those pages are mapped. */ + rcu_read_lock(); - if (nr_pages == 0) - lock_page(page); - else if (!trylock_page(page)) - break; - - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - break; - } + xas_for_each(&xas, folio, ULONG_MAX) { + stop = true; + if (xas_retry(&xas, folio)) + continue; + if (xa_is_value(folio)) + break; + if (folio_index(folio) != index) + break; + if (!folio_try_get_rcu(folio)) { + xas_reset(&xas); + continue; + } + nr_pages = folio_nr_pages(folio); + if (nr_pages > max_pages) + break; - if (!wbc->range_cyclic && page->index > end) { - *done = true; - unlock_page(page); - break; - } + /* Has the page moved or been split? */ + if (unlikely(folio != xas_reload(&xas))) { + folio_put(folio); + break; + } - if (*next && (page->index != *next)) { - /* Not next consecutive page */ - unlock_page(page); - break; - } + if (!folio_trylock(folio)) { + folio_put(folio); + break; + } + if (!folio_test_dirty(folio) || folio_test_writeback(folio)) { + folio_unlock(folio); + folio_put(folio); + break; + } - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); + max_pages -= nr_pages; + psize = folio_size(folio); + len += psize; + stop = false; + if (max_pages <= 0 || len >= max_len || *_count <= 0) + stop = true; - if (PageWriteback(page) || - !clear_page_dirty_for_io(page)) { - unlock_page(page); - break; + index += nr_pages; + if (!folio_batch_add(&batch, folio)) + break; + if (stop) + break; } - /* - * This actually clears the dirty bit in the radix tree. - * See cifs_writepage() for more commentary. + if (!stop) + xas_pause(&xas); + rcu_read_unlock(); + + /* Now, if we obtained any pages, we can shift them to being + * writable and mark them for caching. 
*/ - set_page_writeback(page); - if (page_offset(page) >= i_size_read(mapping->host)) { - *done = true; - unlock_page(page); - end_page_writeback(page); + if (!folio_batch_count(&batch)) break; - } - wdata->pages[i] = page; - *next = page->index + 1; - ++nr_pages; - } + for (i = 0; i < folio_batch_count(&batch); i++) { + folio = batch.folios[i]; + /* The folio should be locked, dirty and not undergoing + * writeback from the loop above. + */ + if (!folio_clear_dirty_for_io(folio)) + WARN_ON(1); + if (folio_start_writeback(folio)) + WARN_ON(1); - /* reset index to refind any pages skipped */ - if (nr_pages == 0) - *index = wdata->pages[0]->index + 1; + *_count -= folio_nr_pages(folio); + folio_unlock(folio); + } - /* put any pages we aren't going to use */ - for (i = nr_pages; i < found_pages; i++) { - put_page(wdata->pages[i]); - wdata->pages[i] = NULL; - } + folio_batch_release(&batch); + cond_resched(); + } while (!stop); - return nr_pages; + *_len = len; } -static int -wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages, - struct address_space *mapping, struct writeback_control *wbc) +/* + * Write back the locked page and any subsequent non-locked dirty pages. + */ +static ssize_t cifs_write_back_from_locked_folio(struct address_space *mapping, + struct writeback_control *wbc, + struct folio *folio, + loff_t start, loff_t end) { + struct inode *inode = mapping->host; + struct TCP_Server_Info *server; + struct cifs_writedata *wdata; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_credits credits_on_stack; + struct cifs_credits *credits = &credits_on_stack; + struct cifsFileInfo *cfile = NULL; + unsigned int xid, wsize, len; + loff_t i_size = i_size_read(inode); + size_t max_len; + long count = wbc->nr_to_write; int rc; - wdata->sync_mode = wbc->sync_mode; - wdata->nr_pages = nr_pages; - wdata->offset = page_offset(wdata->pages[0]); - wdata->pagesz = PAGE_SIZE; - wdata->tailsz = min(i_size_read(mapping->host) - - page_offset(wdata->pages[nr_pages - 1]), - (loff_t)PAGE_SIZE); - wdata->bytes = ((nr_pages - 1) * PAGE_SIZE) + wdata->tailsz; - wdata->pid = wdata->cfile->pid; - - rc = adjust_credits(wdata->server, &wdata->credits, wdata->bytes); - if (rc) - return rc; + /* The folio should be locked, dirty and not undergoing writeback. 
*/ + if (folio_start_writeback(folio)) + WARN_ON(1); - if (wdata->cfile->invalidHandle) - rc = -EAGAIN; - else - rc = wdata->server->ops->async_writev(wdata, - cifs_writedata_release); - - return rc; -} + count -= folio_nr_pages(folio); + len = folio_size(folio); -static int -cifs_writepage_locked(struct page *page, struct writeback_control *wbc); + xid = get_xid(); + server = cifs_pick_channel(cifs_sb_master_tcon(cifs_sb)->ses); -static int cifs_write_one_page(struct page *page, struct writeback_control *wbc, - void *data) -{ - struct address_space *mapping = data; - int ret; + rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY, &cfile); + if (rc) { + cifs_dbg(VFS, "No writable handle in writepages rc=%d\n", rc); + goto err_xid; + } - ret = cifs_writepage_locked(page, wbc); - unlock_page(page); - mapping_set_error(mapping, ret); - return ret; -} + rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->wsize, + &wsize, credits); + if (rc != 0) + goto err_close; -static int cifs_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - struct inode *inode = mapping->host; - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - struct TCP_Server_Info *server; - bool done = false, scanned = false, range_whole = false; - pgoff_t end, index; - struct cifs_writedata *wdata; - struct cifsFileInfo *cfile = NULL; - int rc = 0; - int saved_rc = 0; - unsigned int xid; + wdata = cifs_writedata_alloc(cifs_writev_complete); + if (!wdata) { + rc = -ENOMEM; + goto err_uncredit; + } - /* - * If wsize is smaller than the page cache size, default to writing - * one page at a time. + wdata->sync_mode = wbc->sync_mode; + wdata->offset = folio_pos(folio); + wdata->pid = cfile->pid; + wdata->credits = credits_on_stack; + wdata->cfile = cfile; + wdata->server = server; + cfile = NULL; + + /* Find all consecutive lockable dirty pages, stopping when we find a + * page that is not immediately lockable, is not dirty or is missing, + * or we reach the end of the range. */ - if (cifs_sb->ctx->wsize < PAGE_SIZE) - return write_cache_pages(mapping, wbc, cifs_write_one_page, - mapping); + if (start < i_size) { + /* Trim the write to the EOF; the extra data is ignored. Also + * put an upper limit on the size of a single storedata op. 
+ */ + max_len = wsize; + max_len = min_t(unsigned long long, max_len, end - start + 1); + max_len = min_t(unsigned long long, max_len, i_size - start); - xid = get_xid(); - if (wbc->range_cyclic) { - index = mapping->writeback_index; /* Start from prev offset */ - end = -1; - } else { - index = wbc->range_start >> PAGE_SHIFT; - end = wbc->range_end >> PAGE_SHIFT; - if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) - range_whole = true; - scanned = true; + if (len < max_len) { + int max_pages = INT_MAX; + +#ifdef CONFIG_CIFS_SMB_DIRECT + if (server->smbd_conn) + max_pages = server->smbd_conn->max_frmr_depth; +#endif + max_pages -= folio_nr_pages(folio); + + if (max_pages > 0) + cifs_extend_writeback(mapping, &count, start, + max_pages, max_len, &len); + } + len = min_t(loff_t, len, max_len); } - server = cifs_pick_channel(cifs_sb_master_tcon(cifs_sb)->ses); -retry: - while (!done && index <= end) { - unsigned int i, nr_pages, found_pages, wsize; - pgoff_t next = 0, tofind, saved_index = index; - struct cifs_credits credits_on_stack; - struct cifs_credits *credits = &credits_on_stack; - int get_file_rc = 0; + wdata->bytes = len; - if (cfile) - cifsFileInfo_put(cfile); + /* We now have a contiguous set of dirty pages, each with writeback + * set; the first page is still locked at this point, but all the rest + * have been unlocked. + */ + folio_unlock(folio); - rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY, &cfile); + if (start < i_size) { + iov_iter_xarray(&wdata->iter, ITER_SOURCE, &mapping->i_pages, + start, len); - /* in case of an error store it to return later */ + rc = adjust_credits(wdata->server, &wdata->credits, wdata->bytes); if (rc) - get_file_rc = rc; + goto err_wdata; - rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->wsize, - &wsize, credits); - if (rc != 0) { - done = true; - break; + if (wdata->cfile->invalidHandle) + rc = -EAGAIN; + else + rc = wdata->server->ops->async_writev(wdata, + cifs_writedata_release); + if (rc >= 0) { + kref_put(&wdata->refcount, cifs_writedata_release); + goto err_close; } + } else { + /* The dirty region was entirely beyond the EOF. 
*/ + cifs_pages_written_back(inode, start, len); + rc = 0; + } + +err_wdata: + kref_put(&wdata->refcount, cifs_writedata_release); +err_uncredit: + add_credits_and_wake_if(server, credits, 0); +err_close: + if (cfile) + cifsFileInfo_put(cfile); +err_xid: + free_xid(xid); + if (rc == 0) { + wbc->nr_to_write = count; + } else if (is_retryable_error(rc)) { + cifs_pages_write_redirty(inode, start, len); + } else { + cifs_pages_write_failed(inode, start, len); + mapping_set_error(mapping, rc); + } + /* Indication to update ctime and mtime as close is deferred */ + set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags); + return rc; +} - tofind = min((wsize / PAGE_SIZE) - 1, end - index) + 1; +/* + * write a region of pages back to the server + */ +static int cifs_writepages_region(struct address_space *mapping, + struct writeback_control *wbc, + loff_t start, loff_t end, loff_t *_next) +{ + struct folio_batch fbatch; + int skips = 0; - wdata = wdata_alloc_and_fillpages(tofind, mapping, end, &index, - &found_pages); - if (!wdata) { - rc = -ENOMEM; - done = true; - add_credits_and_wake_if(server, credits, 0); - break; - } + folio_batch_init(&fbatch); + do { + int nr; + pgoff_t index = start / PAGE_SIZE; - if (found_pages == 0) { - kref_put(&wdata->refcount, cifs_writedata_release); - add_credits_and_wake_if(server, credits, 0); + nr = filemap_get_folios_tag(mapping, &index, end / PAGE_SIZE, + PAGECACHE_TAG_DIRTY, &fbatch); + if (!nr) break; - } - nr_pages = wdata_prepare_pages(wdata, found_pages, mapping, wbc, - end, &index, &next, &done); + for (int i = 0; i < nr; i++) { + ssize_t ret; + struct folio *folio = fbatch.folios[i]; - /* nothing to write? */ - if (nr_pages == 0) { - kref_put(&wdata->refcount, cifs_writedata_release); - add_credits_and_wake_if(server, credits, 0); - continue; - } +redo_folio: + start = folio_pos(folio); /* May regress with THPs */ + + /* At this point we hold neither the i_pages lock nor the + * page lock: the page may be truncated or invalidated + * (changing page->mapping to NULL), or even swizzled + * back from swapper_space to tmpfs file mapping + */ + if (wbc->sync_mode != WB_SYNC_NONE) { + ret = folio_lock_killable(folio); + if (ret < 0) + goto write_error; + } else { + if (!folio_trylock(folio)) + goto skip_write; + } - wdata->credits = credits_on_stack; - wdata->cfile = cfile; - wdata->server = server; - cfile = NULL; + if (folio_mapping(folio) != mapping || + !folio_test_dirty(folio)) { + start += folio_size(folio); + folio_unlock(folio); + continue; + } - if (!wdata->cfile) { - cifs_dbg(VFS, "No writable handle in writepages rc=%d\n", - get_file_rc); - if (is_retryable_error(get_file_rc)) - rc = get_file_rc; - else - rc = -EBADF; - } else - rc = wdata_send_pages(wdata, nr_pages, mapping, wbc); + if (folio_test_writeback(folio) || + folio_test_fscache(folio)) { + folio_unlock(folio); + if (wbc->sync_mode == WB_SYNC_NONE) + goto skip_write; - for (i = 0; i < nr_pages; ++i) - unlock_page(wdata->pages[i]); + folio_wait_writeback(folio); +#ifdef CONFIG_CIFS_FSCACHE + folio_wait_fscache(folio); +#endif + goto redo_folio; + } - /* send failure -- clean up the mess */ - if (rc != 0) { - add_credits_and_wake_if(server, &wdata->credits, 0); - for (i = 0; i < nr_pages; ++i) { - if (is_retryable_error(rc)) - redirty_page_for_writepage(wbc, - wdata->pages[i]); - else - SetPageError(wdata->pages[i]); - end_page_writeback(wdata->pages[i]); - put_page(wdata->pages[i]); + if (!folio_clear_dirty_for_io(folio)) + /* We hold the page lock - it should've been dirty. 
*/ + WARN_ON(1); + + ret = cifs_write_back_from_locked_folio(mapping, wbc, folio, start, end); + if (ret < 0) + goto write_error; + + start += ret; + continue; + +write_error: + folio_batch_release(&fbatch); + *_next = start; + return ret; + +skip_write: + /* + * Too many skipped writes, or need to reschedule? + * Treat it as a write error without an error code. + */ + if (skips >= 5 || need_resched()) { + ret = 0; + goto write_error; } - if (!is_retryable_error(rc)) - mapping_set_error(mapping, rc); - } - kref_put(&wdata->refcount, cifs_writedata_release); - if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN) { - index = saved_index; + /* Otherwise, just skip that folio and go on to the next */ + skips++; + start += folio_size(folio); continue; } - /* Return immediately if we received a signal during writing */ - if (is_interrupt_error(rc)) { - done = true; - break; - } + folio_batch_release(&fbatch); + cond_resched(); + } while (wbc->nr_to_write > 0); - if (rc != 0 && saved_rc == 0) - saved_rc = rc; + *_next = start; + return 0; +} - wbc->nr_to_write -= nr_pages; - if (wbc->nr_to_write <= 0) - done = true; +/* + * Write some of the pending data back to the server + */ +static int cifs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + loff_t start, next; + int ret; - index = next; - } + /* We have to be careful as we can end up racing with setattr() + * truncating the pagecache since the caller doesn't take a lock here + * to prevent it. + */ - if (!scanned && !done) { - /* - * We hit the last page and there is more work to be done: wrap - * back to the start of the file - */ - scanned = true; - index = 0; - goto retry; + if (wbc->range_cyclic) { + start = mapping->writeback_index * PAGE_SIZE; + ret = cifs_writepages_region(mapping, wbc, start, LLONG_MAX, &next); + if (ret == 0) { + mapping->writeback_index = next / PAGE_SIZE; + if (start > 0 && wbc->nr_to_write > 0) { + ret = cifs_writepages_region(mapping, wbc, 0, + start, &next); + if (ret == 0) + mapping->writeback_index = + next / PAGE_SIZE; + } + } + } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) { + ret = cifs_writepages_region(mapping, wbc, 0, LLONG_MAX, &next); + if (wbc->nr_to_write > 0 && ret == 0) + mapping->writeback_index = next / PAGE_SIZE; + } else { + ret = cifs_writepages_region(mapping, wbc, + wbc->range_start, wbc->range_end, &next); } - if (saved_rc != 0) - rc = saved_rc; - - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) - mapping->writeback_index = index; - - if (cfile) - cifsFileInfo_put(cfile); - free_xid(xid); - /* Indication to update ctime and mtime as close is deferred */ - set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags); - return rc; + return ret; } static int @@ -2876,6 +3039,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct cifsFileInfo *cfile = file->private_data; struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); + struct folio *folio = page_folio(page); __u32 pid; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) @@ -2886,14 +3050,14 @@ static int cifs_write_end(struct file *file, struct address_space *mapping, cifs_dbg(FYI, "write_end for page %p from pos %lld with %d bytes\n", page, pos, copied); - if (PageChecked(page)) { + if (folio_test_checked(folio)) { if (copied == len) - SetPageUptodate(page); - ClearPageChecked(page); - } else if (!PageUptodate(page) && copied == PAGE_SIZE) - SetPageUptodate(page); + folio_mark_uptodate(folio); + 
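/*
 * The range_cyclic branch above resumes writeback where the previous pass
 * stopped, then wraps around to cover what was skipped.  A minimal userspace
 * model of that two-pass walk; write_region() stands in for
 * cifs_writepages_region() and is not a real kernel API.
 */
#include <stdio.h>

#define PAGE_SZ 4096ULL
#define RANGE_END (~0ULL)       /* stand-in for LLONG_MAX */

static int write_region(unsigned long long start, unsigned long long end,
                        unsigned long long *next)
{
        printf("writing back [%llu, %llu)\n", start, end);
        *next = (end == RANGE_END) ? 1048576 : end;  /* pretend EOF at 1 MiB */
        return 0;
}

int main(void)
{
        unsigned long long writeback_index = 64;  /* pages, from last pass */
        unsigned long long start = writeback_index * PAGE_SZ;
        unsigned long long next;

        /* Pass 1: from the remembered position to the end of the file. */
        if (write_region(start, RANGE_END, &next) == 0) {
                writeback_index = next / PAGE_SZ;
                /* Pass 2: wrap around and cover [0, start). */
                if (start > 0 && write_region(0, start, &next) == 0)
                        writeback_index = next / PAGE_SZ;
        }
        printf("writeback_index = %llu\n", writeback_index);
        return 0;
}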
folio_clear_checked(folio); + } else if (!folio_test_uptodate(folio) && copied == PAGE_SIZE) + folio_mark_uptodate(folio); - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { char *page_data; unsigned offset = pos & (PAGE_SIZE - 1); unsigned int xid; @@ -3053,57 +3217,13 @@ int cifs_flush(struct file *file, fl_owner_t id) return rc; } -static int -cifs_write_allocate_pages(struct page **pages, unsigned long num_pages) -{ - int rc = 0; - unsigned long i; - - for (i = 0; i < num_pages; i++) { - pages[i] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); - if (!pages[i]) { - /* - * save number of pages we have already allocated and - * return with ENOMEM error - */ - num_pages = i; - rc = -ENOMEM; - break; - } - } - - if (rc) { - for (i = 0; i < num_pages; i++) - put_page(pages[i]); - } - return rc; -} - -static inline -size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len) -{ - size_t num_pages; - size_t clen; - - clen = min_t(const size_t, len, wsize); - num_pages = DIV_ROUND_UP(clen, PAGE_SIZE); - - if (cur_len) - *cur_len = clen; - - return num_pages; -} - static void cifs_uncached_writedata_release(struct kref *refcount) { - int i; struct cifs_writedata *wdata = container_of(refcount, struct cifs_writedata, refcount); kref_put(&wdata->ctx->refcount, cifs_aio_ctx_release); - for (i = 0; i < wdata->nr_pages; i++) - put_page(wdata->pages[i]); cifs_writedata_release(refcount); } @@ -3130,48 +3250,6 @@ cifs_uncached_writev_complete(struct work_struct *work) } static int -wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from, - size_t *len, unsigned long *num_pages) -{ - size_t save_len, copied, bytes, cur_len = *len; - unsigned long i, nr_pages = *num_pages; - - save_len = cur_len; - for (i = 0; i < nr_pages; i++) { - bytes = min_t(const size_t, cur_len, PAGE_SIZE); - copied = copy_page_from_iter(wdata->pages[i], 0, bytes, from); - cur_len -= copied; - /* - * If we didn't copy as much as we expected, then that - * may mean we trod into an unmapped area. Stop copying - * at that point. On the next pass through the big - * loop, we'll likely end up getting a zero-length - * write and bailing out of it. - */ - if (copied < bytes) - break; - } - cur_len = save_len - cur_len; - *len = cur_len; - - /* - * If we have no data to send, then that probably means that - * the copy above failed altogether. That's most likely because - * the address in the iovec was bogus. Return -EFAULT and let - * the caller free anything we allocated and bail out. - */ - if (!cur_len) - return -EFAULT; - - /* - * i + 1 now represents the number of pages we actually used in - * the copy phase above. - */ - *num_pages = i + 1; - return 0; -} - -static int cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list, struct cifs_aio_ctx *ctx) { @@ -3241,23 +3319,57 @@ fail: return rc; } +/* + * Select span of a bvec iterator we're going to use. Limit it by both maximum + * size and maximum number of segments. 
+ */ +static size_t cifs_limit_bvec_subset(const struct iov_iter *iter, size_t max_size, + size_t max_segs, unsigned int *_nsegs) +{ + const struct bio_vec *bvecs = iter->bvec; + unsigned int nbv = iter->nr_segs, ix = 0, nsegs = 0; + size_t len, span = 0, n = iter->count; + size_t skip = iter->iov_offset; + + if (WARN_ON(!iov_iter_is_bvec(iter)) || n == 0) + return 0; + + while (n && ix < nbv && skip) { + len = bvecs[ix].bv_len; + if (skip < len) + break; + skip -= len; + n -= len; + ix++; + } + + while (n && ix < nbv) { + len = min3(n, bvecs[ix].bv_len - skip, max_size); + span += len; + nsegs++; + ix++; + if (span >= max_size || nsegs >= max_segs) + break; + skip = 0; + n -= len; + } + + *_nsegs = nsegs; + return span; +} + static int -cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, +cifs_write_from_iter(loff_t fpos, size_t len, struct iov_iter *from, struct cifsFileInfo *open_file, struct cifs_sb_info *cifs_sb, struct list_head *wdata_list, struct cifs_aio_ctx *ctx) { int rc = 0; - size_t cur_len; - unsigned long nr_pages, num_pages, i; + size_t cur_len, max_len; struct cifs_writedata *wdata; - struct iov_iter saved_from = *from; - loff_t saved_offset = offset; pid_t pid; struct TCP_Server_Info *server; - struct page **pagevec; - size_t start; - unsigned int xid; + unsigned int xid, max_segs = INT_MAX; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; @@ -3267,10 +3379,20 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses); xid = get_xid(); +#ifdef CONFIG_CIFS_SMB_DIRECT + if (server->smbd_conn) + max_segs = server->smbd_conn->max_frmr_depth; +#endif + do { - unsigned int wsize; struct cifs_credits credits_on_stack; struct cifs_credits *credits = &credits_on_stack; + unsigned int wsize, nsegs = 0; + + if (signal_pending(current)) { + rc = -EINTR; + break; + } if (open_file->invalidHandle) { rc = cifs_reopen_file(open_file, false); @@ -3285,99 +3407,42 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, if (rc) break; - cur_len = min_t(const size_t, len, wsize); - - if (ctx->direct_io) { - ssize_t result; - - result = iov_iter_get_pages_alloc2( - from, &pagevec, cur_len, &start); - if (result < 0) { - cifs_dbg(VFS, - "direct_writev couldn't get user pages (rc=%zd) iter type %d iov_offset %zd count %zd\n", - result, iov_iter_type(from), - from->iov_offset, from->count); - dump_stack(); - - rc = result; - add_credits_and_wake_if(server, credits, 0); - break; - } - cur_len = (size_t)result; - - nr_pages = - (cur_len + start + PAGE_SIZE - 1) / PAGE_SIZE; - - wdata = cifs_writedata_direct_alloc(pagevec, - cifs_uncached_writev_complete); - if (!wdata) { - rc = -ENOMEM; - for (i = 0; i < nr_pages; i++) - put_page(pagevec[i]); - kvfree(pagevec); - add_credits_and_wake_if(server, credits, 0); - break; - } - - - wdata->page_offset = start; - wdata->tailsz = - nr_pages > 1 ? 
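/*
 * The new cifs_limit_bvec_subset() above decides how much of a BVEC iterator
 * a single wire op may cover.  Below is a userspace model with a simplified
 * segment struct; it mirrors the two-loop shape of the patch: first skip
 * whole segments consumed by the iterator offset, then accumulate until
 * either the byte budget or the segment budget trips.
 */
#include <stdio.h>
#include <stddef.h>

struct vec { size_t len; };     /* simplified bio_vec: only the length */

static size_t limit_subset(const struct vec *v, unsigned int nbv,
                           size_t count, size_t skip,
                           size_t max_size, size_t max_segs,
                           unsigned int *nsegs_out)
{
        unsigned int ix = 0, nsegs = 0;
        size_t len, span = 0, n = count;

        /* Skip whole segments consumed by the iterator offset. */
        while (n && ix < nbv && skip) {
                len = v[ix].len;
                if (skip < len)
                        break;
                skip -= len;
                n -= len;
                ix++;
        }

        /* Accumulate until the byte or segment budget trips. */
        while (n && ix < nbv) {
                len = v[ix].len - skip;
                if (len > n)
                        len = n;
                if (len > max_size)
                        len = max_size;
                span += len;
                nsegs++;
                ix++;
                if (span >= max_size || nsegs >= max_segs)
                        break;
                skip = 0;
                n -= len;
        }

        *nsegs_out = nsegs;
        return span;
}

int main(void)
{
        struct vec v[] = { {4096}, {4096}, {4096} };
        unsigned int nsegs;
        size_t span;

        span = limit_subset(v, 3, 12288, 0, 4000, 8, &nsegs);
        printf("span=%zu nsegs=%u\n", span, nsegs);     /* 4000, 1 */

        span = limit_subset(v, 3, 12288, 0, 10000, 2, &nsegs);
        printf("span=%zu nsegs=%u\n", span, nsegs);     /* 8192, 2 */
        return 0;
}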
- cur_len - (PAGE_SIZE - start) - - (nr_pages - 2) * PAGE_SIZE : - cur_len; - } else { - nr_pages = get_numpages(wsize, len, &cur_len); - wdata = cifs_writedata_alloc(nr_pages, - cifs_uncached_writev_complete); - if (!wdata) { - rc = -ENOMEM; - add_credits_and_wake_if(server, credits, 0); - break; - } - - rc = cifs_write_allocate_pages(wdata->pages, nr_pages); - if (rc) { - kvfree(wdata->pages); - kfree(wdata); - add_credits_and_wake_if(server, credits, 0); - break; - } - - num_pages = nr_pages; - rc = wdata_fill_from_iovec( - wdata, from, &cur_len, &num_pages); - if (rc) { - for (i = 0; i < nr_pages; i++) - put_page(wdata->pages[i]); - kvfree(wdata->pages); - kfree(wdata); - add_credits_and_wake_if(server, credits, 0); - break; - } + max_len = min_t(const size_t, len, wsize); + if (!max_len) { + rc = -EAGAIN; + add_credits_and_wake_if(server, credits, 0); + break; + } - /* - * Bring nr_pages down to the number of pages we - * actually used, and free any pages that we didn't use. - */ - for ( ; nr_pages > num_pages; nr_pages--) - put_page(wdata->pages[nr_pages - 1]); + cur_len = cifs_limit_bvec_subset(from, max_len, max_segs, &nsegs); + cifs_dbg(FYI, "write_from_iter len=%zx/%zx nsegs=%u/%lu/%u\n", + cur_len, max_len, nsegs, from->nr_segs, max_segs); + if (cur_len == 0) { + rc = -EIO; + add_credits_and_wake_if(server, credits, 0); + break; + } - wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE); + wdata = cifs_writedata_alloc(cifs_uncached_writev_complete); + if (!wdata) { + rc = -ENOMEM; + add_credits_and_wake_if(server, credits, 0); + break; } wdata->sync_mode = WB_SYNC_ALL; - wdata->nr_pages = nr_pages; - wdata->offset = (__u64)offset; - wdata->cfile = cifsFileInfo_get(open_file); - wdata->server = server; - wdata->pid = pid; - wdata->bytes = cur_len; - wdata->pagesz = PAGE_SIZE; - wdata->credits = credits_on_stack; - wdata->ctx = ctx; + wdata->offset = (__u64)fpos; + wdata->cfile = cifsFileInfo_get(open_file); + wdata->server = server; + wdata->pid = pid; + wdata->bytes = cur_len; + wdata->credits = credits_on_stack; + wdata->iter = *from; + wdata->ctx = ctx; kref_get(&ctx->refcount); + iov_iter_truncate(&wdata->iter, cur_len); + rc = adjust_credits(server, &wdata->credits, wdata->bytes); if (!rc) { @@ -3392,16 +3457,14 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, add_credits_and_wake_if(server, &wdata->credits, 0); kref_put(&wdata->refcount, cifs_uncached_writedata_release); - if (rc == -EAGAIN) { - *from = saved_from; - iov_iter_advance(from, offset - saved_offset); + if (rc == -EAGAIN) continue; - } break; } list_add_tail(&wdata->list, wdata_list); - offset += cur_len; + iov_iter_advance(from, cur_len); + fpos += cur_len; len -= cur_len; } while (len > 0); @@ -3500,20 +3563,8 @@ static ssize_t __cifs_writev( struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb; struct cifs_aio_ctx *ctx; - struct iov_iter saved_from = *from; - size_t len = iov_iter_count(from); int rc; - /* - * iov_iter_get_pages_alloc doesn't work with ITER_KVEC. - * In this case, fall back to non-direct write function. 
- * this could be improved by getting pages directly in ITER_KVEC - */ - if (direct && iov_iter_is_kvec(from)) { - cifs_dbg(FYI, "use non-direct cifs_writev for kvec I/O\n"); - direct = false; - } - rc = generic_write_checks(iocb, from); if (rc <= 0) return rc; @@ -3535,23 +3586,54 @@ static ssize_t __cifs_writev( ctx->iocb = iocb; ctx->pos = iocb->ki_pos; + ctx->direct_io = direct; + ctx->nr_pinned_pages = 0; - if (direct) { - ctx->direct_io = true; - ctx->iter = *from; - ctx->len = len; - } else { - rc = setup_aio_ctx_iter(ctx, from, ITER_SOURCE); - if (rc) { + if (user_backed_iter(from)) { + /* + * Extract IOVEC/UBUF-type iterators to a BVEC-type iterator as + * they contain references to the calling process's virtual + * memory layout which won't be available in an async worker + * thread. This also takes a pin on every folio involved. + */ + rc = netfs_extract_user_iter(from, iov_iter_count(from), + &ctx->iter, 0); + if (rc < 0) { kref_put(&ctx->refcount, cifs_aio_ctx_release); return rc; } + + ctx->nr_pinned_pages = rc; + ctx->bv = (void *)ctx->iter.bvec; + ctx->bv_need_unpin = iov_iter_extract_will_pin(&ctx->iter); + } else if ((iov_iter_is_bvec(from) || iov_iter_is_kvec(from)) && + !is_sync_kiocb(iocb)) { + /* + * If the op is asynchronous, we need to copy the list attached + * to a BVEC/KVEC-type iterator, but we assume that the storage + * will be pinned by the caller; in any case, we may or may not + * be able to pin the pages, so we don't try. + */ + ctx->bv = (void *)dup_iter(&ctx->iter, from, GFP_KERNEL); + if (!ctx->bv) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return -ENOMEM; + } + } else { + /* + * Otherwise, we just pass the iterator down as-is and rely on + * the caller to make sure the pages referred to by the + * iterator don't evaporate. 
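/*
 * The branches above either extract-and-pin user buffers
 * (netfs_extract_user_iter()) or copy the vector list (dup_iter()) before
 * queueing async work, because a worker thread cannot rely on the
 * submitter's memory layout.  The userspace sketch below shows only the
 * list-copy half: the caller's iovec array is often stack-allocated and may
 * not outlive submission.  Page pinning is not modeled; all names here are
 * illustrative.
 */
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>

struct async_req {
        struct iovec *iov;      /* privately owned copy of the list */
        int iovcnt;
};

static struct async_req *submit(const struct iovec *iov, int iovcnt)
{
        struct async_req *req = malloc(sizeof(*req));

        if (!req)
                return NULL;
        /* Deep-copy the vector list; the caller's array may vanish. */
        req->iov = malloc(iovcnt * sizeof(*iov));
        if (!req->iov) {
                free(req);
                return NULL;
        }
        memcpy(req->iov, iov, iovcnt * sizeof(*iov));
        req->iovcnt = iovcnt;
        return req;             /* later consumed by a worker thread */
}

int main(void)
{
        char a[8], b[16];
        struct iovec iov[2] = {
                { .iov_base = a, .iov_len = sizeof(a) },
                { .iov_base = b, .iov_len = sizeof(b) },
        };
        struct async_req *req = submit(iov, 2);

        if (req) {
                free(req->iov);
                free(req);
        }
        return 0;
}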
+ */ + ctx->iter = *from; } + ctx->len = iov_iter_count(&ctx->iter); + /* grab a lock here because read response handlers can access ctx */ mutex_lock(&ctx->aio_mutex); rc = cifs_write_from_iter(iocb->ki_pos, ctx->len, &ctx->iter, cfile, cifs_sb, &ctx->list, ctx); /* @@ -3694,14 +3776,12 @@ out: return written; } -static struct cifs_readdata * -cifs_readdata_direct_alloc(struct page **pages, work_func_t complete) +static struct cifs_readdata *cifs_readdata_alloc(work_func_t complete) { struct cifs_readdata *rdata; rdata = kzalloc(sizeof(*rdata), GFP_KERNEL); - if (rdata != NULL) { - rdata->pages = pages; + if (rdata) { kref_init(&rdata->refcount); INIT_LIST_HEAD(&rdata->list); init_completion(&rdata->done); @@ -3711,27 +3791,14 @@ cifs_readdata_direct_alloc(struct page **pages, work_func_t complete) return rdata; } -static struct cifs_readdata * -cifs_readdata_alloc(unsigned int nr_pages, work_func_t complete) -{ - struct page **pages = - kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); - struct cifs_readdata *ret = NULL; - - if (pages) { - ret = cifs_readdata_direct_alloc(pages, complete); - if (!ret) - kfree(pages); - } - - return ret; -} - void cifs_readdata_release(struct kref *refcount) { struct cifs_readdata *rdata = container_of(refcount, struct cifs_readdata, refcount); + + if (rdata->ctx) + kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release); #ifdef CONFIG_CIFS_SMB_DIRECT if (rdata->mr) { smbd_deregister_mr(rdata->mr); @@ -3741,85 +3808,9 @@ cifs_readdata_release(struct kref *refcount) if (rdata->cfile) cifsFileInfo_put(rdata->cfile); - kvfree(rdata->pages); kfree(rdata); } -static int -cifs_read_allocate_pages(struct cifs_readdata *rdata, unsigned int nr_pages) -{ - int rc = 0; - struct page *page; - unsigned int i; - - for (i = 0; i < nr_pages; i++) { - page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); - if (!page) { - rc = -ENOMEM; - break; - } - rdata->pages[i] = page; - } - - if (rc) { - unsigned int nr_page_failed = i; - - for (i = 0; i < nr_page_failed; i++) { - put_page(rdata->pages[i]); - rdata->pages[i] = NULL; - } - } - return rc; -} - -static void -cifs_uncached_readdata_release(struct kref *refcount) -{ - struct cifs_readdata *rdata = container_of(refcount, - struct cifs_readdata, refcount); - unsigned int i; - - kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release); - for (i = 0; i < rdata->nr_pages; i++) { - put_page(rdata->pages[i]); - } - cifs_readdata_release(refcount); -} - -/** - * cifs_readdata_to_iov - copy data from pages in response to an iovec - * @rdata: the readdata response with list of pages holding data - * @iter: destination for our data - * - * This function copies data from a list of pages in a readdata response into - * an array of iovecs. It will first calculate where the data should go - * based on the info in the readdata and then copy the data into that spot. - */ -static int -cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) -{ - size_t remaining = rdata->got_bytes; - unsigned int i; - - for (i = 0; i < rdata->nr_pages; i++) { - struct page *page = rdata->pages[i]; - size_t copy = min_t(size_t, remaining, PAGE_SIZE); - size_t written; - - if (unlikely(iov_iter_is_pipe(iter))) { - void *addr = kmap_atomic(page); - - written = copy_to_iter(addr, copy, iter); - kunmap_atomic(addr); - } else - written = copy_page_to_iter(page, 0, copy, iter); - remaining -= written; - if (written < copy && iov_iter_count(iter) > 0) - break; - } - return remaining ? 
-EFAULT : 0; -} - static void collect_uncached_read_data(struct cifs_aio_ctx *ctx); static void @@ -3831,81 +3822,7 @@ cifs_uncached_readv_complete(struct work_struct *work) complete(&rdata->done); collect_uncached_read_data(rdata->ctx); /* the below call can possibly free the last ref to aio ctx */ - kref_put(&rdata->refcount, cifs_uncached_readdata_release); -} - -static int -uncached_fill_pages(struct TCP_Server_Info *server, - struct cifs_readdata *rdata, struct iov_iter *iter, - unsigned int len) -{ - int result = 0; - unsigned int i; - unsigned int nr_pages = rdata->nr_pages; - unsigned int page_offset = rdata->page_offset; - - rdata->got_bytes = 0; - rdata->tailsz = PAGE_SIZE; - for (i = 0; i < nr_pages; i++) { - struct page *page = rdata->pages[i]; - size_t n; - unsigned int segment_size = rdata->pagesz; - - if (i == 0) - segment_size -= page_offset; - else - page_offset = 0; - - - if (len <= 0) { - /* no need to hold page hostage */ - rdata->pages[i] = NULL; - rdata->nr_pages--; - put_page(page); - continue; - } - - n = len; - if (len >= segment_size) - /* enough data to fill the page */ - n = segment_size; - else - rdata->tailsz = len; - len -= n; - - if (iter) - result = copy_page_from_iter( - page, page_offset, n, iter); -#ifdef CONFIG_CIFS_SMB_DIRECT - else if (rdata->mr) - result = n; -#endif - else - result = cifs_read_page_from_socket( - server, page, page_offset, n); - if (result < 0) - break; - - rdata->got_bytes += result; - } - - return rdata->got_bytes > 0 && result != -ECONNABORTED ? - rdata->got_bytes : result; -} - -static int -cifs_uncached_read_into_pages(struct TCP_Server_Info *server, - struct cifs_readdata *rdata, unsigned int len) -{ - return uncached_fill_pages(server, rdata, NULL, len); -} - -static int -cifs_uncached_copy_into_pages(struct TCP_Server_Info *server, - struct cifs_readdata *rdata, - struct iov_iter *iter) -{ - return uncached_fill_pages(server, rdata, iter, iter->count); + kref_put(&rdata->refcount, cifs_readdata_release); } static int cifs_resend_rdata(struct cifs_readdata *rdata, @@ -3976,37 +3893,36 @@ static int cifs_resend_rdata(struct cifs_readdata *rdata, } while (rc == -EAGAIN); fail: - kref_put(&rdata->refcount, cifs_uncached_readdata_release); + kref_put(&rdata->refcount, cifs_readdata_release); return rc; } static int -cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, +cifs_send_async_read(loff_t fpos, size_t len, struct cifsFileInfo *open_file, struct cifs_sb_info *cifs_sb, struct list_head *rdata_list, struct cifs_aio_ctx *ctx) { struct cifs_readdata *rdata; - unsigned int npages, rsize; + unsigned int rsize, nsegs, max_segs = INT_MAX; struct cifs_credits credits_on_stack; struct cifs_credits *credits = &credits_on_stack; - size_t cur_len; + size_t cur_len, max_len; int rc; pid_t pid; struct TCP_Server_Info *server; - struct page **pagevec; - size_t start; - struct iov_iter direct_iov = ctx->iter; server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses); +#ifdef CONFIG_CIFS_SMB_DIRECT + if (server->smbd_conn) + max_segs = server->smbd_conn->max_frmr_depth; +#endif + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; else pid = current->tgid; - if (ctx->direct_io) - iov_iter_advance(&direct_iov, offset - ctx->pos); - do { if (open_file->invalidHandle) { rc = cifs_reopen_file(open_file, true); @@ -4026,78 +3942,37 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, if (rc) break; - cur_len = min_t(const size_t, len, rsize); - - if 
(ctx->direct_io) { - ssize_t result; - - result = iov_iter_get_pages_alloc2( - &direct_iov, &pagevec, - cur_len, &start); - if (result < 0) { - cifs_dbg(VFS, - "Couldn't get user pages (rc=%zd) iter type %d iov_offset %zd count %zd\n", - result, iov_iter_type(&direct_iov), - direct_iov.iov_offset, - direct_iov.count); - dump_stack(); - - rc = result; - add_credits_and_wake_if(server, credits, 0); - break; - } - cur_len = (size_t)result; - - rdata = cifs_readdata_direct_alloc( - pagevec, cifs_uncached_readv_complete); - if (!rdata) { - add_credits_and_wake_if(server, credits, 0); - rc = -ENOMEM; - break; - } - - npages = (cur_len + start + PAGE_SIZE-1) / PAGE_SIZE; - rdata->page_offset = start; - rdata->tailsz = npages > 1 ? - cur_len-(PAGE_SIZE-start)-(npages-2)*PAGE_SIZE : - cur_len; - - } else { + max_len = min_t(size_t, len, rsize); - npages = DIV_ROUND_UP(cur_len, PAGE_SIZE); - /* allocate a readdata struct */ - rdata = cifs_readdata_alloc(npages, - cifs_uncached_readv_complete); - if (!rdata) { - add_credits_and_wake_if(server, credits, 0); - rc = -ENOMEM; - break; - } - - rc = cifs_read_allocate_pages(rdata, npages); - if (rc) { - kvfree(rdata->pages); - kfree(rdata); - add_credits_and_wake_if(server, credits, 0); - break; - } + cur_len = cifs_limit_bvec_subset(&ctx->iter, max_len, + max_segs, &nsegs); + cifs_dbg(FYI, "read-to-iter len=%zx/%zx nsegs=%u/%lu/%u\n", + cur_len, max_len, nsegs, ctx->iter.nr_segs, max_segs); + if (cur_len == 0) { + rc = -EIO; + add_credits_and_wake_if(server, credits, 0); + break; + } - rdata->tailsz = PAGE_SIZE; + rdata = cifs_readdata_alloc(cifs_uncached_readv_complete); + if (!rdata) { + add_credits_and_wake_if(server, credits, 0); + rc = -ENOMEM; + break; } - rdata->server = server; - rdata->cfile = cifsFileInfo_get(open_file); - rdata->nr_pages = npages; - rdata->offset = offset; - rdata->bytes = cur_len; - rdata->pid = pid; - rdata->pagesz = PAGE_SIZE; - rdata->read_into_pages = cifs_uncached_read_into_pages; - rdata->copy_into_pages = cifs_uncached_copy_into_pages; - rdata->credits = credits_on_stack; - rdata->ctx = ctx; + rdata->server = server; + rdata->cfile = cifsFileInfo_get(open_file); + rdata->offset = fpos; + rdata->bytes = cur_len; + rdata->pid = pid; + rdata->credits = credits_on_stack; + rdata->ctx = ctx; kref_get(&ctx->refcount); + rdata->iter = ctx->iter; + iov_iter_truncate(&rdata->iter, cur_len); + rc = adjust_credits(server, &rdata->credits, rdata->bytes); if (!rc) { @@ -4109,17 +3984,15 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, if (rc) { add_credits_and_wake_if(server, &rdata->credits, 0); - kref_put(&rdata->refcount, - cifs_uncached_readdata_release); - if (rc == -EAGAIN) { - iov_iter_revert(&direct_iov, cur_len); + kref_put(&rdata->refcount, cifs_readdata_release); + if (rc == -EAGAIN) continue; - } break; } list_add_tail(&rdata->list, rdata_list); - offset += cur_len; + iov_iter_advance(&ctx->iter, cur_len); + fpos += cur_len; len -= cur_len; } while (len > 0); @@ -4161,22 +4034,6 @@ again: list_del_init(&rdata->list); INIT_LIST_HEAD(&tmp_list); - /* - * Got a part of data and then reconnect has - * happened -- fill the buffer and continue - * reading. 
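/*
 * The uncached read and write loops above slice one master iterator into
 * per-request windows: copy it, truncate the copy (iov_iter_truncate()),
 * then advance the master (iov_iter_advance()).  A minimal model of that
 * pattern, with a plain struct standing in for struct iov_iter:
 */
#include <stdio.h>
#include <stddef.h>

struct cursor {
        size_t pos;     /* offset into the buffer */
        size_t count;   /* bytes remaining */
};

/* Copy, truncate the copy, advance the original. */
static struct cursor slice(struct cursor *master, size_t req_len)
{
        struct cursor c = *master;

        if (c.count > req_len)
                c.count = req_len;      /* iov_iter_truncate() */
        master->pos += c.count;         /* iov_iter_advance() */
        master->count -= c.count;
        return c;
}

int main(void)
{
        struct cursor master = { .pos = 0, .count = 10000 };

        while (master.count) {
                struct cursor req = slice(&master, 4096);

                printf("request: pos=%zu len=%zu\n", req.pos, req.count);
        }
        return 0;
}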
- */ - if (got_bytes && got_bytes < rdata->bytes) { - rc = 0; - if (!ctx->direct_io) - rc = cifs_readdata_to_iov(rdata, to); - if (rc) { - kref_put(&rdata->refcount, - cifs_uncached_readdata_release); - continue; - } - } - if (ctx->direct_io) { /* * Re-use rdata as this is a @@ -4193,7 +4050,7 @@ again: &tmp_list, ctx); kref_put(&rdata->refcount, - cifs_uncached_readdata_release); + cifs_readdata_release); } list_splice(&tmp_list, &ctx->list); @@ -4201,8 +4058,6 @@ again: goto again; } else if (rdata->result) rc = rdata->result; - else if (!ctx->direct_io) - rc = cifs_readdata_to_iov(rdata, to); /* if there was a short read -- discard anything left */ if (rdata->got_bytes && rdata->got_bytes < rdata->bytes) @@ -4211,7 +4066,7 @@ again: ctx->total_len += rdata->got_bytes; } list_del_init(&rdata->list); - kref_put(&rdata->refcount, cifs_uncached_readdata_release); + kref_put(&rdata->refcount, cifs_readdata_release); } if (!ctx->direct_io) @@ -4243,16 +4098,6 @@ static ssize_t __cifs_readv( loff_t offset = iocb->ki_pos; struct cifs_aio_ctx *ctx; - /* - * iov_iter_get_pages_alloc() doesn't work with ITER_KVEC, - * fall back to data copy read path - * this could be improved by getting pages directly in ITER_KVEC - */ - if (direct && iov_iter_is_kvec(to)) { - cifs_dbg(FYI, "use non-direct cifs_user_readv for kvec I/O\n"); - direct = false; - } - len = iov_iter_count(to); if (!len) return 0; @@ -4271,26 +4116,53 @@ static ssize_t __cifs_readv( if (!ctx) return -ENOMEM; - ctx->cfile = cifsFileInfo_get(cfile); + ctx->pos = offset; + ctx->direct_io = direct; + ctx->len = len; + ctx->cfile = cifsFileInfo_get(cfile); + ctx->nr_pinned_pages = 0; if (!is_sync_kiocb(iocb)) ctx->iocb = iocb; - if (user_backed_iter(to)) - ctx->should_dirty = true; - - if (direct) { - ctx->pos = offset; - ctx->direct_io = true; - ctx->iter = *to; - ctx->len = len; - } else { - rc = setup_aio_ctx_iter(ctx, to, ITER_DEST); - if (rc) { + if (user_backed_iter(to)) { + /* + * Extract IOVEC/UBUF-type iterators to a BVEC-type iterator as + * they contain references to the calling process's virtual + * memory layout which won't be available in an async worker + * thread. This also takes a pin on every folio involved. + */ + rc = netfs_extract_user_iter(to, iov_iter_count(to), + &ctx->iter, 0); + if (rc < 0) { kref_put(&ctx->refcount, cifs_aio_ctx_release); return rc; } - len = ctx->len; + + ctx->nr_pinned_pages = rc; + ctx->bv = (void *)ctx->iter.bvec; + ctx->bv_need_unpin = iov_iter_extract_will_pin(&ctx->iter); + ctx->should_dirty = true; + } else if ((iov_iter_is_bvec(to) || iov_iter_is_kvec(to)) && + !is_sync_kiocb(iocb)) { + /* + * If the op is asynchronous, we need to copy the list attached + * to a BVEC/KVEC-type iterator, but we assume that the storage + * will be retained by the caller; in any case, we may or may + * not be able to pin the pages, so we don't try. + */ + ctx->bv = (void *)dup_iter(&ctx->iter, to, GFP_KERNEL); + if (!ctx->bv) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return -ENOMEM; + } + } else { + /* + * Otherwise, we just pass the iterator down as-is and rely on + * the caller to make sure the pages referred to by the + * iterator don't evaporate. + */ + ctx->iter = *to; } if (direct) { @@ -4489,23 +4361,22 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset) * If the page is mmap'ed into a process' page tables, then we need to make * sure that it doesn't change while being written back. 
*/ -static vm_fault_t -cifs_page_mkwrite(struct vm_fault *vmf) +static vm_fault_t cifs_page_mkwrite(struct vm_fault *vmf) { - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); - /* Wait for the page to be written to the cache before we allow it to - * be modified. We then assume the entire page will need writing back. + /* Wait for the folio to be written to the cache before we allow it to + * be modified. We then assume the entire folio will need writing back. */ #ifdef CONFIG_CIFS_FSCACHE - if (PageFsCache(page) && - wait_on_page_fscache_killable(page) < 0) + if (folio_test_fscache(folio) && + folio_wait_fscache_killable(folio) < 0) return VM_FAULT_RETRY; #endif - wait_on_page_writeback(page); + folio_wait_writeback(folio); - if (lock_page_killable(page) < 0) + if (folio_lock_killable(folio) < 0) return VM_FAULT_RETRY; return VM_FAULT_LOCKED; } @@ -4553,149 +4424,72 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) return rc; } -static void -cifs_readv_complete(struct work_struct *work) +/* + * Unlock a bunch of folios in the pagecache. + */ +static void cifs_unlock_folios(struct address_space *mapping, pgoff_t first, pgoff_t last) { - unsigned int i, got_bytes; - struct cifs_readdata *rdata = container_of(work, - struct cifs_readdata, work); + struct folio *folio; + XA_STATE(xas, &mapping->i_pages, first); - got_bytes = rdata->got_bytes; - for (i = 0; i < rdata->nr_pages; i++) { - struct page *page = rdata->pages[i]; - - if (rdata->result == 0 || - (rdata->result == -EAGAIN && got_bytes)) { - flush_dcache_page(page); - SetPageUptodate(page); - } else - SetPageError(page); - - if (rdata->result == 0 || - (rdata->result == -EAGAIN && got_bytes)) - cifs_readpage_to_fscache(rdata->mapping->host, page); - - unlock_page(page); - - got_bytes -= min_t(unsigned int, PAGE_SIZE, got_bytes); - - put_page(page); - rdata->pages[i] = NULL; + rcu_read_lock(); + xas_for_each(&xas, folio, last) { + folio_unlock(folio); } - kref_put(&rdata->refcount, cifs_readdata_release); + rcu_read_unlock(); } -static int -readpages_fill_pages(struct TCP_Server_Info *server, - struct cifs_readdata *rdata, struct iov_iter *iter, - unsigned int len) +static void cifs_readahead_complete(struct work_struct *work) { - int result = 0; - unsigned int i; - u64 eof; - pgoff_t eof_index; - unsigned int nr_pages = rdata->nr_pages; - unsigned int page_offset = rdata->page_offset; - - /* determine the eof that the server (probably) has */ - eof = CIFS_I(rdata->mapping->host)->server_eof; - eof_index = eof ? (eof - 1) >> PAGE_SHIFT : 0; - cifs_dbg(FYI, "eof=%llu eof_index=%lu\n", eof, eof_index); - - rdata->got_bytes = 0; - rdata->tailsz = PAGE_SIZE; - for (i = 0; i < nr_pages; i++) { - struct page *page = rdata->pages[i]; - unsigned int to_read = rdata->pagesz; - size_t n; - - if (i == 0) - to_read -= page_offset; - else - page_offset = 0; - - n = to_read; - - if (len >= to_read) { - len -= to_read; - } else if (len > 0) { - /* enough for partial page, fill and zero the rest */ - zero_user(page, len + page_offset, to_read - len); - n = rdata->tailsz = len; - len = 0; - } else if (page->index > eof_index) { - /* - * The VFS will not try to do readahead past the - * i_size, but it's possible that we have outstanding - * writes with gaps in the middle and the i_size hasn't - * caught up yet. Populate those with zeroed out pages - * to prevent the VFS from repeatedly attempting to - * fill them until the writes are flushed. 
- */ - zero_user(page, 0, PAGE_SIZE); - flush_dcache_page(page); - SetPageUptodate(page); - unlock_page(page); - put_page(page); - rdata->pages[i] = NULL; - rdata->nr_pages--; - continue; - } else { - /* no need to hold page hostage */ - unlock_page(page); - put_page(page); - rdata->pages[i] = NULL; - rdata->nr_pages--; - continue; - } + struct cifs_readdata *rdata = container_of(work, + struct cifs_readdata, work); + struct folio *folio; + pgoff_t last; + bool good = rdata->result == 0 || (rdata->result == -EAGAIN && rdata->got_bytes); - if (iter) - result = copy_page_from_iter( - page, page_offset, n, iter); -#ifdef CONFIG_CIFS_SMB_DIRECT - else if (rdata->mr) - result = n; -#endif - else - result = cifs_read_page_from_socket( - server, page, page_offset, n); - if (result < 0) - break; + XA_STATE(xas, &rdata->mapping->i_pages, rdata->offset / PAGE_SIZE); - rdata->got_bytes += result; - } + if (good) + cifs_readahead_to_fscache(rdata->mapping->host, + rdata->offset, rdata->bytes); - return rdata->got_bytes > 0 && result != -ECONNABORTED ? - rdata->got_bytes : result; -} + if (iov_iter_count(&rdata->iter) > 0) + iov_iter_zero(iov_iter_count(&rdata->iter), &rdata->iter); -static int -cifs_readpages_read_into_pages(struct TCP_Server_Info *server, - struct cifs_readdata *rdata, unsigned int len) -{ - return readpages_fill_pages(server, rdata, NULL, len); -} + last = (rdata->offset + rdata->bytes - 1) / PAGE_SIZE; -static int -cifs_readpages_copy_into_pages(struct TCP_Server_Info *server, - struct cifs_readdata *rdata, - struct iov_iter *iter) -{ - return readpages_fill_pages(server, rdata, iter, iter->count); + rcu_read_lock(); + xas_for_each(&xas, folio, last) { + if (good) { + flush_dcache_folio(folio); + folio_mark_uptodate(folio); + } + folio_unlock(folio); + } + rcu_read_unlock(); + + kref_put(&rdata->refcount, cifs_readdata_release); } static void cifs_readahead(struct readahead_control *ractl) { - int rc; struct cifsFileInfo *open_file = ractl->file->private_data; struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(ractl->file); struct TCP_Server_Info *server; - pid_t pid; - unsigned int xid, nr_pages, last_batch_size = 0, cache_nr_pages = 0; - pgoff_t next_cached = ULONG_MAX; + unsigned int xid, nr_pages, cache_nr_pages = 0; + unsigned int ra_pages; + pgoff_t next_cached = ULONG_MAX, ra_index; bool caching = fscache_cookie_enabled(cifs_inode_cookie(ractl->mapping->host)) && cifs_inode_cookie(ractl->mapping->host)->cache_priv; bool check_cache = caching; + pid_t pid; + int rc = 0; + + /* Note that readahead_count() lags behind our dequeuing of pages from + * the ractl, so we have to keep track for ourselves. + */ + ra_pages = readahead_count(ractl); + ra_index = readahead_index(ractl); xid = get_xid(); @@ -4704,22 +4498,21 @@ static void cifs_readahead(struct readahead_control *ractl) else pid = current->tgid; - rc = 0; server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses); cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n", - __func__, ractl->file, ractl->mapping, readahead_count(ractl)); + __func__, ractl->file, ractl->mapping, ra_pages); /* * Chop the readahead request up into rsize-sized read requests. 
*/ - while ((nr_pages = readahead_count(ractl) - last_batch_size)) { - unsigned int i, got, rsize; - struct page *page; + while ((nr_pages = ra_pages)) { + unsigned int i, rsize; struct cifs_readdata *rdata; struct cifs_credits credits_on_stack; struct cifs_credits *credits = &credits_on_stack; - pgoff_t index = readahead_index(ractl) + last_batch_size; + struct folio *folio; + pgoff_t fsize; /* * Find out if we have anything cached in the range of @@ -4728,21 +4521,22 @@ static void cifs_readahead(struct readahead_control *ractl) if (caching) { if (check_cache) { rc = cifs_fscache_query_occupancy( - ractl->mapping->host, index, nr_pages, + ractl->mapping->host, ra_index, nr_pages, &next_cached, &cache_nr_pages); if (rc < 0) caching = false; check_cache = false; } - if (index == next_cached) { + if (ra_index == next_cached) { /* * TODO: Send a whole batch of pages to be read * by the cache. */ - struct folio *folio = readahead_folio(ractl); - - last_batch_size = folio_nr_pages(folio); + folio = readahead_folio(ractl); + fsize = folio_nr_pages(folio); + ra_pages -= fsize; + ra_index += fsize; if (cifs_readpage_from_fscache(ractl->mapping->host, &folio->page) < 0) { /* @@ -4753,8 +4547,8 @@ static void cifs_readahead(struct readahead_control *ractl) caching = false; } folio_unlock(folio); - next_cached++; - cache_nr_pages--; + next_cached += fsize; + cache_nr_pages -= fsize; if (cache_nr_pages == 0) check_cache = true; continue; @@ -4779,8 +4573,9 @@ static void cifs_readahead(struct readahead_control *ractl) &rsize, credits); if (rc) break; - nr_pages = min_t(size_t, rsize / PAGE_SIZE, readahead_count(ractl)); - nr_pages = min_t(size_t, nr_pages, next_cached - index); + nr_pages = min_t(size_t, rsize / PAGE_SIZE, ra_pages); + if (next_cached != ULONG_MAX) + nr_pages = min_t(size_t, nr_pages, next_cached - ra_index); /* * Give up immediately if rsize is too small to read an entire @@ -4793,33 +4588,31 @@ static void cifs_readahead(struct readahead_control *ractl) break; } - rdata = cifs_readdata_alloc(nr_pages, cifs_readv_complete); + rdata = cifs_readdata_alloc(cifs_readahead_complete); if (!rdata) { /* best to give up if we're out of mem */ add_credits_and_wake_if(server, credits, 0); break; } - got = __readahead_batch(ractl, rdata->pages, nr_pages); - if (got != nr_pages) { - pr_warn("__readahead_batch() returned %u/%u\n", - got, nr_pages); - nr_pages = got; - } - - rdata->nr_pages = nr_pages; - rdata->bytes = readahead_batch_length(ractl); + rdata->offset = ra_index * PAGE_SIZE; + rdata->bytes = nr_pages * PAGE_SIZE; rdata->cfile = cifsFileInfo_get(open_file); rdata->server = server; rdata->mapping = ractl->mapping; - rdata->offset = readahead_pos(ractl); rdata->pid = pid; - rdata->pagesz = PAGE_SIZE; - rdata->tailsz = PAGE_SIZE; - rdata->read_into_pages = cifs_readpages_read_into_pages; - rdata->copy_into_pages = cifs_readpages_copy_into_pages; rdata->credits = credits_on_stack; + for (i = 0; i < nr_pages; i++) { + if (!readahead_folio(ractl)) + WARN_ON(1); + } + ra_pages -= nr_pages; + ra_index += nr_pages; + + iov_iter_xarray(&rdata->iter, ITER_DEST, &rdata->mapping->i_pages, + rdata->offset, rdata->bytes); + rc = adjust_credits(server, &rdata->credits, rdata->bytes); if (!rc) { if (rdata->cfile->invalidHandle) @@ -4830,18 +4623,15 @@ static void cifs_readahead(struct readahead_control *ractl) if (rc) { add_credits_and_wake_if(server, &rdata->credits, 0); - for (i = 0; i < rdata->nr_pages; i++) { - page = rdata->pages[i]; - unlock_page(page); - put_page(page); - } + 
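/*
 * The structural point of the readahead rewrite above: instead of carrying
 * a page array in each request, rdata describes its pagecache span with an
 * ITER_XARRAY iterator.  A condensed kernel-side sketch of that step
 * (kernel APIs; not buildable outside the tree, and the helper name here
 * is ours, not the patch's):
 */
#include <linux/uio.h>
#include <linux/pagemap.h>

static void rdata_attach_pagecache(struct iov_iter *iter,
                                   struct address_space *mapping,
                                   loff_t pos, size_t len)
{
        /* The iterator references folios by index; no page refs are taken
         * here, so the span must be kept stable (locked or under
         * writeback/fscache flags) until the I/O completes.
         */
        iov_iter_xarray(iter, ITER_DEST, &mapping->i_pages, pos, len);
}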
cifs_unlock_folios(rdata->mapping, + rdata->offset / PAGE_SIZE, + (rdata->offset + rdata->bytes - 1) / PAGE_SIZE); /* Fallback to the readpage in error/reconnect cases */ kref_put(&rdata->refcount, cifs_readdata_release); break; } kref_put(&rdata->refcount, cifs_readdata_release); - last_batch_size = nr_pages; } free_xid(xid); @@ -4883,10 +4673,6 @@ static int cifs_readpage_worker(struct file *file, struct page *page, flush_dcache_page(page); SetPageUptodate(page); - - /* send this page to the cache */ - cifs_readpage_to_fscache(file_inode(file), page); - rc = 0; io_error: @@ -5273,3 +5059,19 @@ const struct address_space_operations cifs_addr_ops_smallbuf = { .launder_folio = cifs_launder_folio, .migrate_folio = filemap_migrate_folio, }; + +/* + * Splice data from a file into a pipe. + */ +ssize_t cifs_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + if (unlikely(*ppos >= file_inode(in)->i_sb->s_maxbytes)) + return 0; + if (unlikely(!len)) + return 0; + if (in->f_flags & O_DIRECT) + return direct_splice_read(in, ppos, pipe, len, flags); + return filemap_splice_read(in, ppos, pipe, len, flags); +} diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index f6f3a6b75601..8f6909d633da 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -143,14 +143,12 @@ static int fscache_fallback_read_page(struct inode *inode, struct page *page) struct netfs_cache_resources cres; struct fscache_cookie *cookie = cifs_inode_cookie(inode); struct iov_iter iter; - struct bio_vec bvec[1]; + struct bio_vec bvec; int ret; memset(&cres, 0, sizeof(cres)); - bvec[0].bv_page = page; - bvec[0].bv_offset = 0; - bvec[0].bv_len = PAGE_SIZE; - iov_iter_bvec(&iter, ITER_DEST, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + bvec_set_page(&bvec, page, PAGE_SIZE, 0); + iov_iter_bvec(&iter, ITER_DEST, &bvec, 1, PAGE_SIZE); ret = fscache_begin_read_operation(&cres, cookie); if (ret < 0) @@ -165,22 +163,16 @@ static int fscache_fallback_read_page(struct inode *inode, struct page *page) /* * Fallback page writing interface. 
*/ -static int fscache_fallback_write_page(struct inode *inode, struct page *page, - bool no_space_allocated_yet) +static int fscache_fallback_write_pages(struct inode *inode, loff_t start, size_t len, + bool no_space_allocated_yet) { struct netfs_cache_resources cres; struct fscache_cookie *cookie = cifs_inode_cookie(inode); struct iov_iter iter; - struct bio_vec bvec[1]; - loff_t start = page_offset(page); - size_t len = PAGE_SIZE; int ret; memset(&cres, 0, sizeof(cres)); - bvec[0].bv_page = page; - bvec[0].bv_offset = 0; - bvec[0].bv_len = PAGE_SIZE; - iov_iter_bvec(&iter, ITER_SOURCE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + iov_iter_xarray(&iter, ITER_SOURCE, &inode->i_mapping->i_pages, start, len); ret = fscache_begin_write_operation(&cres, cookie); if (ret < 0) @@ -189,7 +181,7 @@ static int fscache_fallback_write_page(struct inode *inode, struct page *page, ret = cres.ops->prepare_write(&cres, &start, &len, i_size_read(inode), no_space_allocated_yet); if (ret == 0) - ret = fscache_write(&cres, page_offset(page), &iter, NULL, NULL); + ret = fscache_write(&cres, start, &iter, NULL, NULL); fscache_end_operation(&cres); return ret; } @@ -213,12 +205,12 @@ int __cifs_readpage_from_fscache(struct inode *inode, struct page *page) return 0; } -void __cifs_readpage_to_fscache(struct inode *inode, struct page *page) +void __cifs_readahead_to_fscache(struct inode *inode, loff_t pos, size_t len) { - cifs_dbg(FYI, "%s: (fsc: %p, p: %p, i: %p)\n", - __func__, cifs_inode_cookie(inode), page, inode); + cifs_dbg(FYI, "%s: (fsc: %p, p: %llx, l: %zx, i: %p)\n", + __func__, cifs_inode_cookie(inode), pos, len, inode); - fscache_fallback_write_page(inode, page, true); + fscache_fallback_write_pages(inode, pos, len, true); } /* diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h index 67b601041f0a..173999610997 100644 --- a/fs/cifs/fscache.h +++ b/fs/cifs/fscache.h @@ -90,7 +90,7 @@ static inline int cifs_fscache_query_occupancy(struct inode *inode, } extern int __cifs_readpage_from_fscache(struct inode *pinode, struct page *ppage); -extern void __cifs_readpage_to_fscache(struct inode *pinode, struct page *ppage); +extern void __cifs_readahead_to_fscache(struct inode *pinode, loff_t pos, size_t len); static inline int cifs_readpage_from_fscache(struct inode *inode, @@ -101,11 +101,11 @@ static inline int cifs_readpage_from_fscache(struct inode *inode, return -ENOBUFS; } -static inline void cifs_readpage_to_fscache(struct inode *inode, - struct page *page) +static inline void cifs_readahead_to_fscache(struct inode *inode, + loff_t pos, size_t len) { if (cifs_inode_cookie(inode)) - __cifs_readpage_to_fscache(inode, page); + __cifs_readahead_to_fscache(inode, pos, len); } #else /* CONFIG_CIFS_FSCACHE */ @@ -141,7 +141,7 @@ cifs_readpage_from_fscache(struct inode *inode, struct page *page) } static inline -void cifs_readpage_to_fscache(struct inode *inode, struct page *page) {} +void cifs_readahead_to_fscache(struct inode *inode, loff_t pos, size_t len) {} #endif /* CONFIG_CIFS_FSCACHE */ diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index f145a59af89b..1087ac6104a9 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -508,14 +508,15 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path, return PTR_ERR(tlink); tcon = tlink_tcon(tlink); - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = GENERIC_READ; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); - oparms.disposition = FILE_OPEN; - oparms.path = path; - oparms.fid = &fid; - oparms.reconnect = false; + 
oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = GENERIC_READ, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR), + .disposition = FILE_OPEN, + .path = path, + .fid = &fid, + }; if (tcon->ses->server->oplocks) oplock = REQ_OPLOCK; @@ -1518,14 +1519,15 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, goto out; } - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = DELETE | FILE_WRITE_ATTRIBUTES; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); - oparms.disposition = FILE_OPEN; - oparms.path = full_path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = DELETE | FILE_WRITE_ATTRIBUTES, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR), + .disposition = FILE_OPEN, + .path = full_path, + .fid = &fid, + }; rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc != 0) @@ -1910,7 +1912,7 @@ posix_mkdir_get_info: } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ -int cifs_mkdir(struct user_namespace *mnt_userns, struct inode *inode, +int cifs_mkdir(struct mnt_idmap *idmap, struct inode *inode, struct dentry *direntry, umode_t mode) { int rc = 0; @@ -2112,15 +2114,16 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, if (to_dentry->d_parent != from_dentry->d_parent) goto do_rename_exit; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - /* open the file to be renamed -- we need DELETE perms */ - oparms.desired_access = DELETE; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); - oparms.disposition = FILE_OPEN; - oparms.path = from_path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + /* open the file to be renamed -- we need DELETE perms */ + .desired_access = DELETE, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR), + .disposition = FILE_OPEN, + .path = from_path, + .fid = &fid, + }; rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc == 0) { @@ -2138,7 +2141,7 @@ do_rename_exit: } int -cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir, +cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, struct dentry *source_dentry, struct inode *target_dir, struct dentry *target_dentry, unsigned int flags) { @@ -2496,7 +2499,7 @@ int cifs_revalidate_dentry(struct dentry *dentry) return cifs_revalidate_mapping(inode); } -int cifs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int cifs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; @@ -2537,7 +2540,7 @@ int cifs_getattr(struct user_namespace *mnt_userns, const struct path *path, return rc; } - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); stat->blksize = cifs_sb->ctx->bsize; stat->ino = CIFS_I(inode)->uniqueid; @@ -2752,7 +2755,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) attrs->ia_valid |= ATTR_FORCE; - rc = setattr_prepare(&init_user_ns, direntry, attrs); + rc = setattr_prepare(&nop_mnt_idmap, direntry, attrs); if (rc < 0) goto out; @@ -2859,7 +2862,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size); } - setattr_copy(&init_user_ns, 
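/*
 * The cifs_open_parms conversions here lean on a C99 guarantee worth
 * spelling out: assigning a compound literal with designated initializers
 * zero-fills every unnamed member, which is why the explicit
 * "oparms.reconnect = false;" assignments can simply be dropped.  A
 * self-contained demonstration with an illustrative struct:
 */
#include <stdio.h>
#include <stdbool.h>

struct open_parms {
        int desired_access;
        int disposition;
        const char *path;
        bool reconnect;         /* intentionally left out below */
};

int main(void)
{
        struct open_parms oparms;

        oparms.reconnect = true;        /* stale value from earlier use */

        oparms = (struct open_parms) {
                .desired_access = 1,
                .disposition = 2,
                .path = "/tmp/example",
        };

        /* Unnamed members are zeroed: prints "reconnect=0". */
        printf("reconnect=%d\n", oparms.reconnect);
        return 0;
}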
inode, attrs); + setattr_copy(&nop_mnt_idmap, inode, attrs); mark_inode_dirty(inode); /* force revalidate when any of these times are set since some @@ -2903,7 +2906,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) attrs->ia_valid |= ATTR_FORCE; - rc = setattr_prepare(&init_user_ns, direntry, attrs); + rc = setattr_prepare(&nop_mnt_idmap, direntry, attrs); if (rc < 0) goto cifs_setattr_exit; @@ -3058,7 +3061,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size); } - setattr_copy(&init_user_ns, inode, attrs); + setattr_copy(&nop_mnt_idmap, inode, attrs); mark_inode_dirty(inode); cifs_setattr_exit: @@ -3068,7 +3071,7 @@ cifs_setattr_exit: } int -cifs_setattr(struct user_namespace *mnt_userns, struct dentry *direntry, +cifs_setattr(struct mnt_idmap *idmap, struct dentry *direntry, struct iattr *attrs) { struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); diff --git a/fs/cifs/link.c b/fs/cifs/link.c index a5a097a69983..7d97c10f2453 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -271,14 +271,15 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, int buf_type = CIFS_NO_BUFFER; FILE_ALL_INFO file_info; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = GENERIC_READ; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); - oparms.disposition = FILE_OPEN; - oparms.path = path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = GENERIC_READ, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR), + .disposition = FILE_OPEN, + .path = path, + .fid = &fid, + }; rc = CIFS_open(xid, &oparms, &oplock, &file_info); if (rc) @@ -313,14 +314,15 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_io_parms io_parms = {0}; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = GENERIC_WRITE; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); - oparms.disposition = FILE_CREATE; - oparms.path = path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = GENERIC_WRITE, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR), + .disposition = FILE_CREATE, + .path = path, + .fid = &fid, + }; rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc) @@ -355,13 +357,14 @@ smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; struct smb2_file_all_info *pfile_info = NULL; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = GENERIC_READ; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); - oparms.disposition = FILE_OPEN; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = GENERIC_READ, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR), + .disposition = FILE_OPEN, + .fid = &fid, + }; utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); if (utf16_path == NULL) @@ -421,14 +424,15 @@ smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, if (!utf16_path) return -ENOMEM; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = GENERIC_WRITE; - oparms.create_options = 
cifs_create_options(cifs_sb, CREATE_NOT_DIR); - oparms.disposition = FILE_CREATE; - oparms.fid = &fid; - oparms.reconnect = false; - oparms.mode = 0644; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = GENERIC_WRITE, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR), + .disposition = FILE_CREATE, + .fid = &fid, + .mode = 0644, + }; rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL, NULL); @@ -569,7 +573,7 @@ cifs_hl_exit: } int -cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode, +cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, struct dentry *direntry, const char *symname) { int rc = -EOPNOTSUPP; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 2a19c7987c5b..2905734eb289 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -966,16 +966,22 @@ cifs_aio_ctx_release(struct kref *refcount) /* * ctx->bv is only set if setup_aio_ctx_iter() was called successfully - * which means that iov_iter_get_pages() was a success and thus that - * we have taken reference on pages. + * which means that iov_iter_extract_pages() was a success and thus + * that we may have references or pins on pages that we need to + * release. */ if (ctx->bv) { - unsigned i; + if (ctx->should_dirty || ctx->bv_need_unpin) { + unsigned int i; - for (i = 0; i < ctx->npages; i++) { - if (ctx->should_dirty) - set_page_dirty(ctx->bv[i].bv_page); - put_page(ctx->bv[i].bv_page); + for (i = 0; i < ctx->nr_pinned_pages; i++) { + struct page *page = ctx->bv[i].bv_page; + + if (ctx->should_dirty) + set_page_dirty(page); + if (ctx->bv_need_unpin) + unpin_user_page(page); + } } kvfree(ctx->bv); } @@ -983,95 +989,6 @@ cifs_aio_ctx_release(struct kref *refcount) kfree(ctx); } -#define CIFS_AIO_KMALLOC_LIMIT (1024 * 1024) - -int -setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw) -{ - ssize_t rc; - unsigned int cur_npages; - unsigned int npages = 0; - unsigned int i; - size_t len; - size_t count = iov_iter_count(iter); - unsigned int saved_len; - size_t start; - unsigned int max_pages = iov_iter_npages(iter, INT_MAX); - struct page **pages = NULL; - struct bio_vec *bv = NULL; - - if (iov_iter_is_kvec(iter)) { - memcpy(&ctx->iter, iter, sizeof(*iter)); - ctx->len = count; - iov_iter_advance(iter, count); - return 0; - } - - if (array_size(max_pages, sizeof(*bv)) <= CIFS_AIO_KMALLOC_LIMIT) - bv = kmalloc_array(max_pages, sizeof(*bv), GFP_KERNEL); - - if (!bv) { - bv = vmalloc(array_size(max_pages, sizeof(*bv))); - if (!bv) - return -ENOMEM; - } - - if (array_size(max_pages, sizeof(*pages)) <= CIFS_AIO_KMALLOC_LIMIT) - pages = kmalloc_array(max_pages, sizeof(*pages), GFP_KERNEL); - - if (!pages) { - pages = vmalloc(array_size(max_pages, sizeof(*pages))); - if (!pages) { - kvfree(bv); - return -ENOMEM; - } - } - - saved_len = count; - - while (count && npages < max_pages) { - rc = iov_iter_get_pages2(iter, pages, count, max_pages, &start); - if (rc < 0) { - cifs_dbg(VFS, "Couldn't get user pages (rc=%zd)\n", rc); - break; - } - - if (rc > count) { - cifs_dbg(VFS, "get pages rc=%zd more than %zu\n", rc, - count); - break; - } - - count -= rc; - rc += start; - cur_npages = DIV_ROUND_UP(rc, PAGE_SIZE); - - if (npages + cur_npages > max_pages) { - cifs_dbg(VFS, "out of vec array capacity (%u vs %u)\n", - npages + cur_npages, max_pages); - break; - } - - for (i = 0; i < cur_npages; i++) { - len = rc > PAGE_SIZE ? 
PAGE_SIZE : rc; - bv[npages + i].bv_page = pages[i]; - bv[npages + i].bv_offset = start; - bv[npages + i].bv_len = len - start; - rc -= len; - start = 0; - } - - npages += cur_npages; - } - - kvfree(pages); - ctx->bv = bv; - ctx->len = saved_len - count; - ctx->npages = npages; - iov_iter_bvec(&ctx->iter, rw, ctx->bv, npages, ctx->len); - return 0; -} - /** * cifs_alloc_hash - allocate hash and hash context together * @name: The name of the crypto hash algo @@ -1129,25 +1046,6 @@ cifs_free_hash(struct shash_desc **sdesc) *sdesc = NULL; } -/** - * rqst_page_get_length - obtain the length and offset for a page in smb_rqst - * @rqst: The request descriptor - * @page: The index of the page to query - * @len: Where to store the length for this page: - * @offset: Where to store the offset for this page - */ -void rqst_page_get_length(const struct smb_rqst *rqst, unsigned int page, - unsigned int *len, unsigned int *offset) -{ - *len = rqst->rq_pagesz; - *offset = (page == 0) ? rqst->rq_offset : 0; - - if (rqst->rq_npages == 1 || page == rqst->rq_npages-1) - *len = rqst->rq_tailsz; - else if (page == 0) - *len = rqst->rq_pagesz - rqst->rq_offset; -} - void extract_unc_hostname(const char *unc, const char **h, size_t *len) { const char *end; diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h index 55758b9ec877..2c5dde2ece58 100644 --- a/fs/cifs/ntlmssp.h +++ b/fs/cifs/ntlmssp.h @@ -83,7 +83,7 @@ typedef struct _NEGOTIATE_MESSAGE { SECURITY_BUFFER WorkstationName; /* RFC 1001 and ASCII */ /* SECURITY_BUFFER for version info not present since we do not set the version is present flag */ - char DomainString[0]; + char DomainString[]; /* followed by WorkstationString */ } __attribute__((packed)) NEGOTIATE_MESSAGE, *PNEGOTIATE_MESSAGE; @@ -135,7 +135,7 @@ typedef struct _AUTHENTICATE_MESSAGE { __le32 NegotiateFlags; /* SECURITY_BUFFER for version info not present since we do not set the version is present flag */ - char UserString[0]; + char UserString[]; } __attribute__((packed)) AUTHENTICATE_MESSAGE, *PAUTHENTICATE_MESSAGE; /* diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 2d75ba5aaa8a..ef638086d734 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -495,7 +495,7 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level) FIND_FILE_STANDARD_INFO *pfData; pfData = (FIND_FILE_STANDARD_INFO *)pDirInfo; - new_entry = old_entry + sizeof(FIND_FILE_STANDARD_INFO) + + new_entry = old_entry + sizeof(FIND_FILE_STANDARD_INFO) + 1 + pfData->FileNameLength; } else { u32 next_offset = le32_to_cpu(pDirInfo->NextEntryOffset); @@ -513,9 +513,9 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level) new_entry, end_of_smb, old_entry); return NULL; } else if (((level == SMB_FIND_FILE_INFO_STANDARD) && - (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb)) + (new_entry + sizeof(FIND_FILE_STANDARD_INFO) + 1 > end_of_smb)) || ((level != SMB_FIND_FILE_INFO_STANDARD) && - (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb))) { + (new_entry + sizeof(FILE_DIRECTORY_INFO) + 1 > end_of_smb))) { cifs_dbg(VFS, "search entry %p extends after end of SMB %p\n", new_entry, end_of_smb); return NULL; diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index c47b254f0d1e..d2cbae4b5d21 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -480,7 +480,6 @@ out: * remove this channel */ cancel_delayed_work_sync(&chan->server->echo); - cancel_delayed_work_sync(&chan->server->resolve); cancel_delayed_work_sync(&chan->server->reconnect); spin_lock(&ses->chan_lock); diff --git 
a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 4cb364454e13..abda6148be10 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -576,14 +576,15 @@ static int cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, if (!(le32_to_cpu(fi.Attributes) & ATTR_REPARSE)) return 0; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.disposition = FILE_OPEN; - oparms.path = full_path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = FILE_READ_ATTRIBUTES, + .create_options = cifs_create_options(cifs_sb, 0), + .disposition = FILE_OPEN, + .path = full_path, + .fid = &fid, + }; /* Need to check if this is a symbolic link or not */ tmprc = CIFS_open(xid, &oparms, &oplock, NULL); @@ -823,14 +824,15 @@ smb_set_file_info(struct inode *inode, const char *full_path, goto out; } - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = SYNCHRONIZE | FILE_WRITE_ATTRIBUTES; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); - oparms.disposition = FILE_OPEN; - oparms.path = full_path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR), + .disposition = FILE_OPEN, + .path = full_path, + .fid = &fid, + }; cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n"); rc = CIFS_open(xid, &oparms, &oplock, NULL); @@ -998,15 +1000,16 @@ cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, goto out; } - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.create_options = cifs_create_options(cifs_sb, - OPEN_REPARSE_POINT); - oparms.disposition = FILE_OPEN; - oparms.path = full_path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = FILE_READ_ATTRIBUTES, + .create_options = cifs_create_options(cifs_sb, + OPEN_REPARSE_POINT), + .disposition = FILE_OPEN, + .path = full_path, + .fid = &fid, + }; rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc) @@ -1115,15 +1118,16 @@ cifs_make_node(unsigned int xid, struct inode *inode, cifs_dbg(FYI, "sfu compat create special file\n"); - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = GENERIC_WRITE; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR | - CREATE_OPTION_SPECIAL); - oparms.disposition = FILE_CREATE; - oparms.path = full_path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = GENERIC_WRITE, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR | + CREATE_OPTION_SPECIAL), + .disposition = FILE_CREATE, + .path = full_path, + .fid = &fid, + }; if (tcon->ses->server->oplocks) oplock = REQ_OPLOCK; diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index ba6cc50af390..e0ee96d69d49 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -7,6 +7,7 @@ * */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/stat.h> #include <linux/slab.h> #include <linux/pagemap.h> @@ -34,7 +35,7 @@ static struct smb2_symlink_err_rsp *symlink_data(const struct kvec *iov) len = 
(u32)err->ErrorContextCount * (offsetof(struct smb2_error_context_rsp, ErrorContextData) + sizeof(struct smb2_symlink_err_rsp)); - if (le32_to_cpu(err->ByteCount) < len || iov->iov_len < len + sizeof(*err)) + if (le32_to_cpu(err->ByteCount) < len || iov->iov_len < len + sizeof(*err) + 1) return ERR_PTR(-EINVAL); p = (struct smb2_error_context_rsp *)err->ErrorData; diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 8521adf9ce79..37b4cd59245d 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -105,14 +105,15 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, goto finished; } - vars->oparms.tcon = tcon; - vars->oparms.desired_access = desired_access; - vars->oparms.disposition = create_disposition; - vars->oparms.create_options = cifs_create_options(cifs_sb, create_options); - vars->oparms.fid = &fid; - vars->oparms.reconnect = false; - vars->oparms.mode = mode; - vars->oparms.cifs_sb = cifs_sb; + vars->oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = desired_access, + .disposition = create_disposition, + .create_options = cifs_create_options(cifs_sb, create_options), + .fid = &fid, + .mode = mode, + .cifs_sb = cifs_sb, + }; rqst[num_rqst].rq_iov = &vars->open_iov[0]; rqst[num_rqst].rq_nvec = SMB2_CREATE_IOV_SIZE; diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 572293c18e16..3935a60db5c3 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -113,7 +113,7 @@ static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len, } else if (nc_offset + 1 == non_ctxlen) { cifs_dbg(FYI, "no SPNEGO security blob in negprot rsp\n"); size_of_pad_before_neg_ctxts = 0; - } else if (non_ctxlen == SMB311_NEGPROT_BASE_SIZE) + } else if (non_ctxlen == SMB311_NEGPROT_BASE_SIZE + 1) /* has padding, but no SPNEGO blob */ size_of_pad_before_neg_ctxts = nc_offset - non_ctxlen + 1; else diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index e6bcd2baf446..f79b075f2992 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -729,12 +729,13 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid fid; struct cached_fid *cfid = NULL; - oparms.tcon = tcon; - oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = FILE_READ_ATTRIBUTES, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, 0), + .fid = &fid, + }; rc = open_cached_dir(xid, tcon, "", cifs_sb, false, &cfid); if (rc == 0) @@ -771,12 +772,13 @@ smb2_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_fid fid; - oparms.tcon = tcon; - oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = FILE_READ_ATTRIBUTES, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, 0), + .fid = &fid, + }; rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL, NULL, NULL); @@ -816,12 +818,13 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, if (!utf16_path) return -ENOMEM; - oparms.tcon = tcon; - oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, 0); - 
oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = FILE_READ_ATTRIBUTES, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, 0), + .fid = &fid, + }; rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, &err_iov, &err_buftype); @@ -1097,13 +1100,13 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, rqst[0].rq_iov = open_iov; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; - memset(&oparms, 0, sizeof(oparms)); - oparms.tcon = tcon; - oparms.desired_access = FILE_WRITE_EA; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = FILE_WRITE_EA, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, 0), + .fid = &fid, + }; rc = SMB2_open_init(tcon, server, &rqst[0], &oplock, &oparms, utf16_path); @@ -1453,12 +1456,12 @@ smb2_ioctl_query_info(const unsigned int xid, rqst[0].rq_iov = &vars->open_iov[0]; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; - memset(&oparms, 0, sizeof(oparms)); - oparms.tcon = tcon; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, create_options); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, create_options), + .fid = &fid, + }; if (qi.flags & PASSTHRU_FSCTL) { switch (qi.info_type & FSCTL_DEVICE_ACCESS_MASK) { @@ -2088,12 +2091,13 @@ smb3_notify(const unsigned int xid, struct file *pfile, } tcon = cifs_sb_master_tcon(cifs_sb); - oparms.tcon = tcon; - oparms.desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, 0), + .fid = &fid, + }; rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL, NULL); @@ -2159,12 +2163,13 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, rqst[0].rq_iov = open_iov; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; - oparms.tcon = tcon; - oparms.desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.fid = fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, 0), + .fid = fid, + }; rc = SMB2_open_init(tcon, server, &rqst[0], &oplock, &oparms, utf16_path); @@ -2490,12 +2495,13 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, rqst[0].rq_iov = open_iov; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; - oparms.tcon = tcon; - oparms.desired_access = desired_access; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = desired_access, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, 0), + .fid = &fid, + }; rc = SMB2_open_init(tcon, server, &rqst[0], &oplock, &oparms, utf16_path); 
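The hunks above and below repeat one conversion: the field-by-field setup of struct cifs_open_parms (sometimes preceded by memset(&oparms, 0, sizeof(oparms))) becomes a single compound-literal assignment with designated initializers. C guarantees that members omitted from such an initializer are zero-initialized, which is why the explicit memset() calls and the trailing .reconnect = false assignments can simply disappear. A minimal standalone sketch of the idiom follows; the struct here is a hypothetical stand-in, not the real cifs_open_parms definition.

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    /* Hypothetical stand-in for struct cifs_open_parms; the fields are
       illustrative only. */
    struct open_parms {
            int desired_access;
            int disposition;
            int create_options;
            bool reconnect;         /* must be false on a first open */
    };

    int main(void)
    {
            struct open_parms oparms;

            /* Old pattern: every field assigned by hand; any field that
               is forgotten keeps indeterminate stack contents unless a
               memset() is remembered first. */
            memset(&oparms, 0, sizeof(oparms));
            oparms.desired_access = 1;
            oparms.disposition = 2;

            /* New pattern, as in the patch: one compound-literal
               assignment; members not named (.create_options and
               .reconnect) are implicitly zero-initialized. */
            oparms = (struct open_parms) {
                    .desired_access = 1,
                    .disposition = 2,
            };

            printf("reconnect=%d create_options=%d\n",
                   oparms.reconnect, oparms.create_options);
            return 0;
    }

The same reasoning covers the memset(&oparms, 0, sizeof(oparms)) removals in the smb2ops.c hunks and the vars->oparms conversion in smb2inode.c: the compound literal subsumes the zeroing.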
@@ -2623,12 +2629,13 @@ smb311_queryfs(const unsigned int xid, struct cifs_tcon *tcon, if (!tcon->posix_extensions) return smb2_queryfs(xid, tcon, cifs_sb, buf); - oparms.tcon = tcon; - oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = FILE_READ_ATTRIBUTES, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, 0), + .fid = &fid, + }; rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL, NULL, NULL); @@ -2916,13 +2923,13 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, rqst[0].rq_iov = open_iov; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; - memset(&oparms, 0, sizeof(oparms)); - oparms.tcon = tcon; - oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, create_options); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = FILE_READ_ATTRIBUTES, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, create_options), + .fid = &fid, + }; rc = SMB2_open_init(tcon, server, &rqst[0], &oplock, &oparms, utf16_path); @@ -3056,13 +3063,13 @@ smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, rqst[0].rq_iov = open_iov; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; - memset(&oparms, 0, sizeof(oparms)); - oparms.tcon = tcon; - oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, OPEN_REPARSE_POINT); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = FILE_READ_ATTRIBUTES, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, OPEN_REPARSE_POINT), + .fid = &fid, + }; rc = SMB2_open_init(tcon, server, &rqst[0], &oplock, &oparms, utf16_path); @@ -3196,17 +3203,20 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb, return ERR_PTR(rc); } - oparms.tcon = tcon; - oparms.desired_access = READ_CONTROL; - oparms.disposition = FILE_OPEN; - /* - * When querying an ACL, even if the file is a symlink we want to open - * the source not the target, and so the protocol requires that the - * client specify this flag when opening a reparse point - */ - oparms.create_options = cifs_create_options(cifs_sb, 0) | OPEN_REPARSE_POINT; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = READ_CONTROL, + .disposition = FILE_OPEN, + /* + * When querying an ACL, even if the file is a symlink + * we want to open the source not the target, and so + * the protocol requires that the client specify this + * flag when opening a reparse point + */ + .create_options = cifs_create_options(cifs_sb, 0) | + OPEN_REPARSE_POINT, + .fid = &fid, + }; if (info & SACL_SECINFO) oparms.desired_access |= SYSTEM_SECURITY; @@ -3265,13 +3275,14 @@ set_smb2_acl(struct cifs_ntsd *pnntsd, __u32 acllen, return rc; } - oparms.tcon = tcon; - oparms.desired_access = access_flags; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.disposition = FILE_OPEN; - oparms.path = path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = access_flags, + .create_options = cifs_create_options(cifs_sb, 0), + 
.disposition = FILE_OPEN, + .path = path, + .fid = &fid, + }; rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL, NULL); @@ -4227,8 +4238,8 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len, static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst *rqst, int num_rqst, const u8 *sig, u8 **iv, - struct aead_request **req, struct scatterlist **sgl, - unsigned int *num_sgs) + struct aead_request **req, struct sg_table *sgt, + unsigned int *num_sgs, size_t *sensitive_size) { unsigned int req_size = sizeof(**req) + crypto_aead_reqsize(tfm); unsigned int iv_size = crypto_aead_ivsize(tfm); @@ -4236,70 +4247,75 @@ static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst u8 *p; *num_sgs = cifs_get_num_sgs(rqst, num_rqst, sig); + if (IS_ERR_VALUE((long)(int)*num_sgs)) + return ERR_PTR(*num_sgs); len = iv_size; len += crypto_aead_alignmask(tfm) & ~(crypto_tfm_ctx_alignment() - 1); len = ALIGN(len, crypto_tfm_ctx_alignment()); len += req_size; len = ALIGN(len, __alignof__(struct scatterlist)); - len += *num_sgs * sizeof(**sgl); + len += array_size(*num_sgs, sizeof(struct scatterlist)); + *sensitive_size = len; - p = kmalloc(len, GFP_ATOMIC); + p = kvzalloc(len, GFP_NOFS); if (!p) - return NULL; + return ERR_PTR(-ENOMEM); *iv = (u8 *)PTR_ALIGN(p, crypto_aead_alignmask(tfm) + 1); *req = (struct aead_request *)PTR_ALIGN(*iv + iv_size, crypto_tfm_ctx_alignment()); - *sgl = (struct scatterlist *)PTR_ALIGN((u8 *)*req + req_size, - __alignof__(struct scatterlist)); + sgt->sgl = (struct scatterlist *)PTR_ALIGN((u8 *)*req + req_size, + __alignof__(struct scatterlist)); return p; } -static void *smb2_get_aead_req(struct crypto_aead *tfm, const struct smb_rqst *rqst, +static void *smb2_get_aead_req(struct crypto_aead *tfm, struct smb_rqst *rqst, int num_rqst, const u8 *sig, u8 **iv, - struct aead_request **req, struct scatterlist **sgl) + struct aead_request **req, struct scatterlist **sgl, + size_t *sensitive_size) { - unsigned int off, len, skip; - struct scatterlist *sg; - unsigned int num_sgs; - unsigned long addr; - int i, j; + struct sg_table sgtable = {}; + unsigned int skip, num_sgs, i, j; + ssize_t rc; void *p; - p = smb2_aead_req_alloc(tfm, rqst, num_rqst, sig, iv, req, sgl, &num_sgs); - if (!p) - return NULL; + p = smb2_aead_req_alloc(tfm, rqst, num_rqst, sig, iv, req, &sgtable, + &num_sgs, sensitive_size); + if (IS_ERR(p)) + return ERR_CAST(p); - sg_init_table(*sgl, num_sgs); - sg = *sgl; + sg_init_marker(sgtable.sgl, num_sgs); - /* Assumes the first rqst has a transform header as the first iov. - * I.e. - * rqst[0].rq_iov[0] is transform header - * rqst[0].rq_iov[1+] data to be encrypted/decrypted - * rqst[1+].rq_iov[0+] data to be encrypted/decrypted + /* + * The first rqst has a transform header where the + * first 20 bytes are not part of the encrypted blob. */ + skip = 20; + for (i = 0; i < num_rqst; i++) { - /* - * The first rqst has a transform header where the - * first 20 bytes are not part of the encrypted blob. - */ + struct iov_iter *iter = &rqst[i].rq_iter; + size_t count = iov_iter_count(iter); + for (j = 0; j < rqst[i].rq_nvec; j++) { - struct kvec *iov = &rqst[i].rq_iov[j]; + cifs_sg_set_buf(&sgtable, + rqst[i].rq_iov[j].iov_base + skip, + rqst[i].rq_iov[j].iov_len - skip); - skip = (i == 0) && (j == 0) ? 
20 : 0; - addr = (unsigned long)iov->iov_base + skip; - len = iov->iov_len - skip; - sg = cifs_sg_set_buf(sg, (void *)addr, len); - } - for (j = 0; j < rqst[i].rq_npages; j++) { - rqst_page_get_length(&rqst[i], j, &len, &off); - sg_set_page(sg++, rqst[i].rq_pages[j], len, off); + /* See the above comment on the 'skip' assignment */ + skip = 0; } + sgtable.orig_nents = sgtable.nents; + + rc = netfs_extract_iter_to_sg(iter, count, &sgtable, + num_sgs - sgtable.nents, 0); + iov_iter_revert(iter, rc); + sgtable.orig_nents = sgtable.nents; } - cifs_sg_set_buf(sg, sig, SMB2_SIGNATURE_SIZE); + cifs_sg_set_buf(&sgtable, sig, SMB2_SIGNATURE_SIZE); + sg_mark_end(&sgtable.sgl[sgtable.nents - 1]); + *sgl = sgtable.sgl; return p; } @@ -4353,6 +4369,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, struct crypto_aead *tfm; unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize); void *creq; + size_t sensitive_size; rc = smb2_get_enc_key(server, le64_to_cpu(tr_hdr->SessionId), enc, key); if (rc) { @@ -4386,9 +4403,10 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, return rc; } - creq = smb2_get_aead_req(tfm, rqst, num_rqst, sign, &iv, &req, &sg); - if (unlikely(!creq)) - return -ENOMEM; + creq = smb2_get_aead_req(tfm, rqst, num_rqst, sign, &iv, &req, &sg, + &sensitive_size); + if (IS_ERR(creq)) + return PTR_ERR(creq); if (!enc) { memcpy(sign, &tr_hdr->Signature, SMB2_SIGNATURE_SIZE); @@ -4416,22 +4434,35 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, if (!rc && enc) memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE); - kfree_sensitive(creq); + kvfree_sensitive(creq, sensitive_size); return rc; } +/* + * Clear a read buffer, discarding the folios which have XA_MARK_0 set. + */ +static void cifs_clear_xarray_buffer(struct xarray *buffer) +{ + struct folio *folio; + + XA_STATE(xas, buffer, 0); + + rcu_read_lock(); + xas_for_each_marked(&xas, folio, ULONG_MAX, XA_MARK_0) { + folio_put(folio); + } + rcu_read_unlock(); + xa_destroy(buffer); +} + void smb3_free_compound_rqst(int num_rqst, struct smb_rqst *rqst) { - int i, j; + int i; - for (i = 0; i < num_rqst; i++) { - if (rqst[i].rq_pages) { - for (j = rqst[i].rq_npages - 1; j >= 0; j--) - put_page(rqst[i].rq_pages[j]); - kfree(rqst[i].rq_pages); - } - } + for (i = 0; i < num_rqst; i++) + if (!xa_empty(&rqst[i].rq_buffer)) + cifs_clear_xarray_buffer(&rqst[i].rq_buffer); } /* @@ -4451,9 +4482,8 @@ static int smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst, struct smb_rqst *new_rq, struct smb_rqst *old_rq) { - struct page **pages; struct smb2_transform_hdr *tr_hdr = new_rq[0].rq_iov[0].iov_base; - unsigned int npages; + struct page *page; unsigned int orig_len = 0; int i, j; int rc = -ENOMEM; @@ -4461,40 +4491,45 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst, for (i = 1; i < num_rqst; i++) { struct smb_rqst *old = &old_rq[i - 1]; struct smb_rqst *new = &new_rq[i]; + struct xarray *buffer = &new->rq_buffer; + size_t size = iov_iter_count(&old->rq_iter), seg, copied = 0; orig_len += smb_rqst_len(server, old); new->rq_iov = old->rq_iov; new->rq_nvec = old->rq_nvec; - npages = old->rq_npages; - if (!npages) - continue; - - pages = kmalloc_array(npages, sizeof(struct page *), - GFP_KERNEL); - if (!pages) - goto err_free; - - new->rq_pages = pages; - new->rq_npages = npages; - new->rq_offset = old->rq_offset; - new->rq_pagesz = old->rq_pagesz; - new->rq_tailsz = old->rq_tailsz; - - for (j = 0; j < npages; j++) { - pages[j] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); - 
if (!pages[j]) - goto err_free; - } + xa_init(buffer); + + if (size > 0) { + unsigned int npages = DIV_ROUND_UP(size, PAGE_SIZE); + + for (j = 0; j < npages; j++) { + void *o; - /* copy pages form the old */ - for (j = 0; j < npages; j++) { - unsigned int offset, len; + rc = -ENOMEM; + page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); + if (!page) + goto err_free; + page->index = j; + o = xa_store(buffer, j, page, GFP_KERNEL); + if (xa_is_err(o)) { + rc = xa_err(o); + put_page(page); + goto err_free; + } - rqst_page_get_length(new, j, &len, &offset); + xa_set_mark(buffer, j, XA_MARK_0); - memcpy_page(new->rq_pages[j], offset, - old->rq_pages[j], offset, len); + seg = min_t(size_t, size - copied, PAGE_SIZE); + if (copy_page_from_iter(page, 0, seg, &old->rq_iter) != seg) { + rc = -EFAULT; + goto err_free; + } + copied += seg; + } + iov_iter_xarray(&new->rq_iter, ITER_SOURCE, + buffer, 0, size); + new->rq_iter_size = size; } } @@ -4523,12 +4558,12 @@ smb3_is_transform_hdr(void *buf) static int decrypt_raw_data(struct TCP_Server_Info *server, char *buf, - unsigned int buf_data_size, struct page **pages, - unsigned int npages, unsigned int page_data_size, + unsigned int buf_data_size, struct iov_iter *iter, bool is_offloaded) { struct kvec iov[2]; struct smb_rqst rqst = {NULL}; + size_t iter_size = 0; int rc; iov[0].iov_base = buf; @@ -4538,10 +4573,11 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, rqst.rq_iov = iov; rqst.rq_nvec = 2; - rqst.rq_pages = pages; - rqst.rq_npages = npages; - rqst.rq_pagesz = PAGE_SIZE; - rqst.rq_tailsz = (page_data_size % PAGE_SIZE) ? : PAGE_SIZE; + if (iter) { + rqst.rq_iter = *iter; + rqst.rq_iter_size = iov_iter_count(iter); + iter_size = iov_iter_count(iter); + } rc = crypt_message(server, 1, &rqst, 0); cifs_dbg(FYI, "Decrypt message returned %d\n", rc); @@ -4552,73 +4588,37 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, memmove(buf, iov[1].iov_base, buf_data_size); if (!is_offloaded) - server->total_read = buf_data_size + page_data_size; + server->total_read = buf_data_size + iter_size; return rc; } static int -read_data_into_pages(struct TCP_Server_Info *server, struct page **pages, - unsigned int npages, unsigned int len) +cifs_copy_pages_to_iter(struct xarray *pages, unsigned int data_size, + unsigned int skip, struct iov_iter *iter) { - int i; - int length; + struct page *page; + unsigned long index; - for (i = 0; i < npages; i++) { - struct page *page = pages[i]; - size_t n; + xa_for_each(pages, index, page) { + size_t n, len = min_t(unsigned int, PAGE_SIZE - skip, data_size); - n = len; - if (len >= PAGE_SIZE) { - /* enough data to fill the page */ - n = PAGE_SIZE; - len -= n; - } else { - zero_user(page, len, PAGE_SIZE - len); - len = 0; + n = copy_page_to_iter(page, skip, len, iter); + if (n != len) { + cifs_dbg(VFS, "%s: something went wrong\n", __func__); + return -EIO; } - length = cifs_read_page_from_socket(server, page, 0, n); - if (length < 0) - return length; - server->total_read += length; + data_size -= n; + skip = 0; } return 0; } static int -init_read_bvec(struct page **pages, unsigned int npages, unsigned int data_size, - unsigned int cur_off, struct bio_vec **page_vec) -{ - struct bio_vec *bvec; - int i; - - bvec = kcalloc(npages, sizeof(struct bio_vec), GFP_KERNEL); - if (!bvec) - return -ENOMEM; - - for (i = 0; i < npages; i++) { - bvec[i].bv_page = pages[i]; - bvec[i].bv_offset = (i == 0) ? 
cur_off : 0; - bvec[i].bv_len = min_t(unsigned int, PAGE_SIZE, data_size); - data_size -= bvec[i].bv_len; - } - - if (data_size != 0) { - cifs_dbg(VFS, "%s: something went wrong\n", __func__); - kfree(bvec); - return -EIO; - } - - *page_vec = bvec; - return 0; -} - -static int handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, - char *buf, unsigned int buf_len, struct page **pages, - unsigned int npages, unsigned int page_data_size, - bool is_offloaded) + char *buf, unsigned int buf_len, struct xarray *pages, + unsigned int pages_len, bool is_offloaded) { unsigned int data_offset; unsigned int data_len; @@ -4627,9 +4627,6 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, unsigned int pad_len; struct cifs_readdata *rdata = mid->callback_data; struct smb2_hdr *shdr = (struct smb2_hdr *)buf; - struct bio_vec *bvec = NULL; - struct iov_iter iter; - struct kvec iov; int length; bool use_rdma_mr = false; @@ -4718,7 +4715,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, return 0; } - if (data_len > page_data_size - pad_len) { + if (data_len > pages_len - pad_len) { /* data_len is corrupt -- discard frame */ rdata->result = -EIO; if (is_offloaded) @@ -4728,8 +4725,9 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, return 0; } - rdata->result = init_read_bvec(pages, npages, page_data_size, - cur_off, &bvec); + /* Copy the data to the output I/O iterator. */ + rdata->result = cifs_copy_pages_to_iter(pages, pages_len, + cur_off, &rdata->iter); if (rdata->result != 0) { if (is_offloaded) mid->mid_state = MID_RESPONSE_MALFORMED; @@ -4737,14 +4735,16 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, dequeue_mid(mid, rdata->result); return 0; } + rdata->got_bytes = pages_len; - iov_iter_bvec(&iter, ITER_SOURCE, bvec, npages, data_len); } else if (buf_len >= data_offset + data_len) { /* read response payload is in buf */ - WARN_ONCE(npages > 0, "read data can be either in buf or in pages"); - iov.iov_base = buf + data_offset; - iov.iov_len = data_len; - iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, data_len); + WARN_ONCE(pages && !xa_empty(pages), + "read data can be either in buf or in pages"); + length = copy_to_iter(buf + data_offset, data_len, &rdata->iter); + if (length < 0) + return length; + rdata->got_bytes = data_len; } else { /* read response payload cannot be in both buf and pages */ WARN_ONCE(1, "buf can not contain only a part of read data"); @@ -4756,26 +4756,18 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, return 0; } - length = rdata->copy_into_pages(server, rdata, &iter); - - kfree(bvec); - - if (length < 0) - return length; - if (is_offloaded) mid->mid_state = MID_RESPONSE_RECEIVED; else dequeue_mid(mid, false); - return length; + return 0; } struct smb2_decrypt_work { struct work_struct decrypt; struct TCP_Server_Info *server; - struct page **ppages; + struct xarray buffer; char *buf; - unsigned int npages; unsigned int len; }; @@ -4784,11 +4776,13 @@ static void smb2_decrypt_offload(struct work_struct *work) { struct smb2_decrypt_work *dw = container_of(work, struct smb2_decrypt_work, decrypt); - int i, rc; + int rc; struct mid_q_entry *mid; + struct iov_iter iter; + iov_iter_xarray(&iter, ITER_DEST, &dw->buffer, 0, dw->len); rc = decrypt_raw_data(dw->server, dw->buf, dw->server->vals->read_rsp_size, - dw->ppages, dw->npages, dw->len, true); + &iter, true); if (rc) { cifs_dbg(VFS, "error decrypting rc=%d\n", rc); goto 
free_pages; @@ -4802,7 +4796,7 @@ static void smb2_decrypt_offload(struct work_struct *work) mid->decrypted = true; rc = handle_read_data(dw->server, mid, dw->buf, dw->server->vals->read_rsp_size, - dw->ppages, dw->npages, dw->len, + &dw->buffer, dw->len, true); if (rc >= 0) { #ifdef CONFIG_CIFS_STATS2 @@ -4835,10 +4829,7 @@ static void smb2_decrypt_offload(struct work_struct *work) } free_pages: - for (i = dw->npages-1; i >= 0; i--) - put_page(dw->ppages[i]); - - kfree(dw->ppages); + cifs_clear_xarray_buffer(&dw->buffer); cifs_small_buf_release(dw->buf); kfree(dw); } @@ -4848,47 +4839,66 @@ static int receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid, int *num_mids) { + struct page *page; char *buf = server->smallbuf; struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf; - unsigned int npages; - struct page **pages; - unsigned int len; + struct iov_iter iter; + unsigned int len, npages; unsigned int buflen = server->pdu_size; int rc; int i = 0; struct smb2_decrypt_work *dw; + dw = kzalloc(sizeof(struct smb2_decrypt_work), GFP_KERNEL); + if (!dw) + return -ENOMEM; + xa_init(&dw->buffer); + INIT_WORK(&dw->decrypt, smb2_decrypt_offload); + dw->server = server; + *num_mids = 1; len = min_t(unsigned int, buflen, server->vals->read_rsp_size + sizeof(struct smb2_transform_hdr)) - HEADER_SIZE(server) + 1; rc = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1, len); if (rc < 0) - return rc; + goto free_dw; server->total_read += rc; len = le32_to_cpu(tr_hdr->OriginalMessageSize) - server->vals->read_rsp_size; + dw->len = len; npages = DIV_ROUND_UP(len, PAGE_SIZE); - pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); - if (!pages) { - rc = -ENOMEM; - goto discard_data; - } - + rc = -ENOMEM; for (; i < npages; i++) { - pages[i] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); - if (!pages[i]) { - rc = -ENOMEM; + void *old; + + page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); + if (!page) + goto discard_data; + page->index = i; + old = xa_store(&dw->buffer, i, page, GFP_KERNEL); + if (xa_is_err(old)) { + rc = xa_err(old); + put_page(page); goto discard_data; } + xa_set_mark(&dw->buffer, i, XA_MARK_0); } - /* read read data into pages */ - rc = read_data_into_pages(server, pages, npages, len); - if (rc) - goto free_pages; + iov_iter_xarray(&iter, ITER_DEST, &dw->buffer, 0, npages * PAGE_SIZE); + + /* Read the data into the buffer and clear excess bufferage. 
*/ + rc = cifs_read_iter_from_socket(server, &iter, dw->len); + if (rc < 0) + goto discard_data; + + server->total_read += rc; + if (rc < npages * PAGE_SIZE) + iov_iter_zero(npages * PAGE_SIZE - rc, &iter); + iov_iter_revert(&iter, npages * PAGE_SIZE); + iov_iter_truncate(&iter, dw->len); rc = cifs_discard_remaining_data(server); if (rc) @@ -4901,39 +4911,28 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid, if ((server->min_offload) && (server->in_flight > 1) && (server->pdu_size >= server->min_offload)) { - dw = kmalloc(sizeof(struct smb2_decrypt_work), GFP_KERNEL); - if (dw == NULL) - goto non_offloaded_decrypt; - dw->buf = server->smallbuf; server->smallbuf = (char *)cifs_small_buf_get(); - INIT_WORK(&dw->decrypt, smb2_decrypt_offload); - - dw->npages = npages; - dw->server = server; - dw->ppages = pages; - dw->len = len; queue_work(decrypt_wq, &dw->decrypt); *num_mids = 0; /* worker thread takes care of finding mid */ return -1; } -non_offloaded_decrypt: rc = decrypt_raw_data(server, buf, server->vals->read_rsp_size, - pages, npages, len, false); + &iter, false); if (rc) goto free_pages; *mid = smb2_find_mid(server, buf); - if (*mid == NULL) + if (*mid == NULL) { cifs_dbg(FYI, "mid not found\n"); - else { + } else { cifs_dbg(FYI, "mid found\n"); (*mid)->decrypted = true; rc = handle_read_data(server, *mid, buf, server->vals->read_rsp_size, - pages, npages, len, false); + &dw->buffer, dw->len, false); if (rc >= 0) { if (server->ops->is_network_name_deleted) { server->ops->is_network_name_deleted(buf, @@ -4943,9 +4942,9 @@ non_offloaded_decrypt: } free_pages: - for (i = i - 1; i >= 0; i--) - put_page(pages[i]); - kfree(pages); + cifs_clear_xarray_buffer(&dw->buffer); +free_dw: + kfree(dw); return rc; discard_data: cifs_discard_remaining_data(server); @@ -4983,7 +4982,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server, server->total_read += length; buf_size = pdu_length - sizeof(struct smb2_transform_hdr); - length = decrypt_raw_data(server, buf, buf_size, NULL, 0, 0, false); + length = decrypt_raw_data(server, buf, buf_size, NULL, false); if (length) return length; @@ -5082,7 +5081,7 @@ smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid) char *buf = server->large_buf ? 
server->bigbuf : server->smallbuf; return handle_read_data(server, mid, buf, server->pdu_size, - NULL, 0, 0, false); + NULL, 0, false); } static int @@ -5134,15 +5133,16 @@ smb2_make_node(unsigned int xid, struct inode *inode, cifs_dbg(FYI, "sfu compat create special file\n"); - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = GENERIC_WRITE; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR | - CREATE_OPTION_SPECIAL); - oparms.disposition = FILE_CREATE; - oparms.path = full_path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = GENERIC_WRITE, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR | + CREATE_OPTION_SPECIAL), + .disposition = FILE_CREATE, + .path = full_path, + .fid = &fid, + }; if (tcon->ses->server->oplocks) oplock = REQ_OPLOCK; @@ -5629,7 +5629,7 @@ struct smb_version_values smb20_values = { .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, @@ -5651,7 +5651,7 @@ struct smb_version_values smb21_values = { .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, @@ -5672,7 +5672,7 @@ struct smb_version_values smb3any_values = { .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, @@ -5693,7 +5693,7 @@ struct smb_version_values smbdefault_values = { .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, @@ -5714,7 +5714,7 @@ struct smb_version_values smb30_values = { .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, @@ -5735,7 +5735,7 @@ struct smb_version_values smb302_values = { .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, @@ -5756,7 +5756,7 @@ struct smb_version_values smb311_values = { .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 2c9ffa921e6f..ca9d7110ddcb 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -139,6 +139,66 @@ out: return; } +static int wait_for_server_reconnect(struct TCP_Server_Info *server, + __le16 
smb2_command, bool retry) +{ + int timeout = 10; + int rc; + + spin_lock(&server->srv_lock); + if (server->tcpStatus != CifsNeedReconnect) { + spin_unlock(&server->srv_lock); + return 0; + } + timeout *= server->nr_targets; + spin_unlock(&server->srv_lock); + + /* + * Return to caller for TREE_DISCONNECT and LOGOFF and CLOSE + * here since they are implicitly done when session drops. + */ + switch (smb2_command) { + /* + * BB Should we keep oplock break and add flush to exceptions? + */ + case SMB2_TREE_DISCONNECT: + case SMB2_CANCEL: + case SMB2_CLOSE: + case SMB2_OPLOCK_BREAK: + return -EAGAIN; + } + + /* + * Give demultiplex thread up to 10 seconds to each target available for + * reconnect -- should be greater than cifs socket timeout which is 7 + * seconds. + * + * On "soft" mounts we wait once. Hard mounts keep retrying until + * process is killed or server comes back on-line. + */ + do { + rc = wait_event_interruptible_timeout(server->response_q, + (server->tcpStatus != CifsNeedReconnect), + timeout * HZ); + if (rc < 0) { + cifs_dbg(FYI, "%s: aborting reconnect due to received signal\n", + __func__); + return -ERESTARTSYS; + } + + /* are we still trying to reconnect? */ + spin_lock(&server->srv_lock); + if (server->tcpStatus != CifsNeedReconnect) { + spin_unlock(&server->srv_lock); + return 0; + } + spin_unlock(&server->srv_lock); + } while (retry); + + cifs_dbg(FYI, "%s: gave up waiting on reconnect\n", __func__); + return -EHOSTDOWN; +} + static int smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, struct TCP_Server_Info *server) @@ -146,7 +206,6 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, int rc = 0; struct nls_table *nls_codepage; struct cifs_ses *ses; - int retries; /* * SMB2s NegProt, SessSetup, Logoff do not have tcon yet so @@ -184,61 +243,11 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, (!tcon->ses->server) || !server) return -EIO; - ses = tcon->ses; - retries = server->nr_targets; - - /* - * Give demultiplex thread up to 10 seconds to each target available for - * reconnect -- should be greater than cifs socket timeout which is 7 - * seconds. - */ - while (server->tcpStatus == CifsNeedReconnect) { - /* - * Return to caller for TREE_DISCONNECT and LOGOFF and CLOSE - * here since they are implicitly done when session drops. - */ - switch (smb2_command) { - /* - * BB Should we keep oplock break and add flush to exceptions? - */ - case SMB2_TREE_DISCONNECT: - case SMB2_CANCEL: - case SMB2_CLOSE: - case SMB2_OPLOCK_BREAK: - return -EAGAIN; - } - - rc = wait_event_interruptible_timeout(server->response_q, - (server->tcpStatus != CifsNeedReconnect), - 10 * HZ); - if (rc < 0) { - cifs_dbg(FYI, "%s: aborting reconnect due to a received signal by the process\n", - __func__); - return -ERESTARTSYS; - } - - /* are we still trying to reconnect? */ - spin_lock(&server->srv_lock); - if (server->tcpStatus != CifsNeedReconnect) { - spin_unlock(&server->srv_lock); - break; - } - spin_unlock(&server->srv_lock); - - if (retries && --retries) - continue; + rc = wait_for_server_reconnect(server, smb2_command, tcon->retry); + if (rc) + return rc; - /* - * on "soft" mounts we wait once. 
Hard mounts keep - * retrying until process is killed or server comes - * back on-line - */ - if (!tcon->retry) { - cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n"); - return -EHOSTDOWN; - } - retries = server->nr_targets; - } + ses = tcon->ses; spin_lock(&ses->chan_lock); if (!cifs_chan_needs_reconnect(ses, server) && !tcon->need_reconnect) { @@ -1364,7 +1373,7 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data) /* Testing shows that buffer offset must be at location of Buffer[0] */ req->SecurityBufferOffset = - cpu_to_le16(sizeof(struct smb2_sess_setup_req) - 1 /* pad */); + cpu_to_le16(sizeof(struct smb2_sess_setup_req)); req->SecurityBufferLength = cpu_to_le16(sess_data->iov[1].iov_len); memset(&rqst, 0, sizeof(struct smb_rqst)); @@ -1858,12 +1867,12 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, if (unc_path == NULL) return -ENOMEM; - unc_path_len = cifs_strtoUTF16(unc_path, tree, strlen(tree), cp) + 1; - unc_path_len *= 2; - if (unc_path_len < 2) { + unc_path_len = cifs_strtoUTF16(unc_path, tree, strlen(tree), cp); + if (unc_path_len <= 0) { kfree(unc_path); return -EINVAL; } + unc_path_len *= 2; /* SMB2 TREE_CONNECT request must be called with TreeId == 0 */ tcon->tid = 0; @@ -1883,9 +1892,8 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, iov[0].iov_len = total_len - 1; /* Testing shows that buffer offset must be at location of Buffer[0] */ - req->PathOffset = cpu_to_le16(sizeof(struct smb2_tree_connect_req) - - 1 /* pad */); - req->PathLength = cpu_to_le16(unc_path_len - 2); + req->PathOffset = cpu_to_le16(sizeof(struct smb2_tree_connect_req)); + req->PathLength = cpu_to_le16(unc_path_len); iov[1].iov_base = unc_path; iov[1].iov_len = unc_path_len; @@ -3764,7 +3772,7 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, ses->Suid, (u8)watch_tree, completion_filter); /* validate that notify information is plausible */ if ((rsp_iov.iov_base == NULL) || - (rsp_iov.iov_len < sizeof(struct smb2_change_notify_rsp))) + (rsp_iov.iov_len < sizeof(struct smb2_change_notify_rsp) + 1)) goto cnotify_exit; smb_rsp = (struct smb2_change_notify_rsp *)rsp_iov.iov_base; @@ -3898,7 +3906,7 @@ void smb2_reconnect_server(struct work_struct *work) goto done; /* allocate a dummy tcon struct used for reconnect */ - tcon = kzalloc(sizeof(struct cifs_tcon), GFP_KERNEL); + tcon = tconInfoAlloc(); if (!tcon) { resched = true; list_for_each_entry_safe(ses, ses2, &tmp_ses_list, rlist) { @@ -3921,7 +3929,7 @@ void smb2_reconnect_server(struct work_struct *work) list_del_init(&ses->rlist); cifs_put_smb_ses(ses); } - kfree(tcon); + tconInfoFree(tcon); done: cifs_dbg(FYI, "Reconnecting tcons and channels finished\n"); @@ -4054,6 +4062,36 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, return rc; } +#ifdef CONFIG_CIFS_SMB_DIRECT +static inline bool smb3_use_rdma_offload(struct cifs_io_parms *io_parms) +{ + struct TCP_Server_Info *server = io_parms->server; + struct cifs_tcon *tcon = io_parms->tcon; + + /* we can only offload if we're connected */ + if (!server || !tcon) + return false; + + /* we can only offload on an rdma connection */ + if (!server->rdma || !server->smbd_conn) + return false; + + /* we don't support signed offload yet */ + if (server->sign) + return false; + + /* we don't support encrypted offload yet */ + if (smb3_encryption_required(tcon)) + return false; + + /* offload also has its overhead, so only do it if desired */ + if (io_parms->length < 
server->smbd_conn->rdma_readwrite_threshold) + return false; + + return true; +} +#endif /* CONFIG_CIFS_SMB_DIRECT */ + /* * To form a chain of read requests, any read requests after the first should * have the end_of_chain boolean set to true. @@ -4097,16 +4135,12 @@ smb2_new_read_req(void **buf, unsigned int *total_len, * If we want to do a RDMA write, fill in and append * smbd_buffer_descriptor_v1 to the end of read request */ - if (server->rdma && rdata && !server->sign && - rdata->bytes >= server->smbd_conn->rdma_readwrite_threshold) { - + if (smb3_use_rdma_offload(io_parms)) { struct smbd_buffer_descriptor_v1 *v1; bool need_invalidate = server->dialect == SMB30_PROT_ID; - rdata->mr = smbd_register_mr( - server->smbd_conn, rdata->pages, - rdata->nr_pages, rdata->page_offset, - rdata->tailsz, true, need_invalidate); + rdata->mr = smbd_register_mr(server->smbd_conn, &rdata->iter, + true, need_invalidate); if (!rdata->mr) return -EAGAIN; @@ -4163,15 +4197,9 @@ smb2_readv_callback(struct mid_q_entry *mid) (struct smb2_hdr *)rdata->iov[0].iov_base; struct cifs_credits credits = { .value = 0, .instance = 0 }; struct smb_rqst rqst = { .rq_iov = &rdata->iov[1], - .rq_nvec = 1, }; - - if (rdata->got_bytes) { - rqst.rq_pages = rdata->pages; - rqst.rq_offset = rdata->page_offset; - rqst.rq_npages = rdata->nr_pages; - rqst.rq_pagesz = rdata->pagesz; - rqst.rq_tailsz = rdata->tailsz; - } + .rq_nvec = 1, + .rq_iter = rdata->iter, + .rq_iter_size = iov_iter_count(&rdata->iter), }; WARN_ONCE(rdata->server != mid->server, "rdata server %p != mid server %p", @@ -4189,6 +4217,8 @@ smb2_readv_callback(struct mid_q_entry *mid) if (server->sign && !mid->decrypted) { int rc; + iov_iter_revert(&rqst.rq_iter, rdata->got_bytes); + iov_iter_truncate(&rqst.rq_iter, rdata->got_bytes); rc = smb2_verify_signature(&rqst, server); if (rc) cifs_tcon_dbg(VFS, "SMB signature verification returned error = %d\n", @@ -4495,10 +4525,27 @@ smb2_async_writev(struct cifs_writedata *wdata, struct kvec iov[1]; struct smb_rqst rqst = { }; unsigned int total_len; + struct cifs_io_parms _io_parms; + struct cifs_io_parms *io_parms = NULL; if (!wdata->server) server = wdata->server = cifs_pick_channel(tcon->ses); + /* + * in future we may get cifs_io_parms passed in from the caller, + * but for now we construct it here... 
+ */ + _io_parms = (struct cifs_io_parms) { + .tcon = tcon, + .server = server, + .offset = wdata->offset, + .length = wdata->bytes, + .persistent_fid = wdata->cfile->fid.persistent_fid, + .volatile_fid = wdata->cfile->fid.volatile_fid, + .pid = wdata->pid, + }; + io_parms = &_io_parms; + rc = smb2_plain_req_init(SMB2_WRITE, tcon, server, (void **) &req, &total_len); if (rc) @@ -4508,49 +4555,44 @@ smb2_async_writev(struct cifs_writedata *wdata, flags |= CIFS_TRANSFORM_REQ; shdr = (struct smb2_hdr *)req; - shdr->Id.SyncId.ProcessId = cpu_to_le32(wdata->cfile->pid); + shdr->Id.SyncId.ProcessId = cpu_to_le32(io_parms->pid); - req->PersistentFileId = wdata->cfile->fid.persistent_fid; - req->VolatileFileId = wdata->cfile->fid.volatile_fid; + req->PersistentFileId = io_parms->persistent_fid; + req->VolatileFileId = io_parms->volatile_fid; req->WriteChannelInfoOffset = 0; req->WriteChannelInfoLength = 0; - req->Channel = 0; - req->Offset = cpu_to_le64(wdata->offset); + req->Channel = SMB2_CHANNEL_NONE; + req->Offset = cpu_to_le64(io_parms->offset); req->DataOffset = cpu_to_le16( offsetof(struct smb2_write_req, Buffer)); req->RemainingBytes = 0; - trace_smb3_write_enter(0 /* xid */, wdata->cfile->fid.persistent_fid, - tcon->tid, tcon->ses->Suid, wdata->offset, wdata->bytes); + trace_smb3_write_enter(0 /* xid */, + io_parms->persistent_fid, + io_parms->tcon->tid, + io_parms->tcon->ses->Suid, + io_parms->offset, + io_parms->length); + #ifdef CONFIG_CIFS_SMB_DIRECT /* * If we want to do a server RDMA read, fill in and append * smbd_buffer_descriptor_v1 to the end of write request */ - if (server->rdma && !server->sign && wdata->bytes >= - server->smbd_conn->rdma_readwrite_threshold) { - + if (smb3_use_rdma_offload(io_parms)) { struct smbd_buffer_descriptor_v1 *v1; + size_t data_size = iov_iter_count(&wdata->iter); bool need_invalidate = server->dialect == SMB30_PROT_ID; - wdata->mr = smbd_register_mr( - server->smbd_conn, wdata->pages, - wdata->nr_pages, wdata->page_offset, - wdata->tailsz, false, need_invalidate); + wdata->mr = smbd_register_mr(server->smbd_conn, &wdata->iter, + false, need_invalidate); if (!wdata->mr) { rc = -EAGAIN; goto async_writev_out; } req->Length = 0; req->DataOffset = 0; - if (wdata->nr_pages > 1) - req->RemainingBytes = - cpu_to_le32( - (wdata->nr_pages - 1) * wdata->pagesz - - wdata->page_offset + wdata->tailsz - ); - else - req->RemainingBytes = cpu_to_le32(wdata->tailsz); + req->RemainingBytes = cpu_to_le32(data_size); req->Channel = SMB2_CHANNEL_RDMA_V1_INVALIDATE; if (need_invalidate) req->Channel = SMB2_CHANNEL_RDMA_V1; @@ -4569,26 +4611,21 @@ smb2_async_writev(struct cifs_writedata *wdata, rqst.rq_iov = iov; rqst.rq_nvec = 1; - rqst.rq_pages = wdata->pages; - rqst.rq_offset = wdata->page_offset; - rqst.rq_npages = wdata->nr_pages; - rqst.rq_pagesz = wdata->pagesz; - rqst.rq_tailsz = wdata->tailsz; + rqst.rq_iter = wdata->iter; + rqst.rq_iter_size = iov_iter_count(&rqst.rq_iter); #ifdef CONFIG_CIFS_SMB_DIRECT - if (wdata->mr) { + if (wdata->mr) iov[0].iov_len += sizeof(struct smbd_buffer_descriptor_v1); - rqst.rq_npages = 0; - } #endif - cifs_dbg(FYI, "async write at %llu %u bytes\n", - wdata->offset, wdata->bytes); + cifs_dbg(FYI, "async write at %llu %u bytes iter=%zx\n", + io_parms->offset, io_parms->length, iov_iter_count(&rqst.rq_iter)); #ifdef CONFIG_CIFS_SMB_DIRECT /* For RDMA read, I/O size is in RemainingBytes not in Length */ if (!wdata->mr) - req->Length = cpu_to_le32(wdata->bytes); + req->Length = cpu_to_le32(io_parms->length); #else - req->Length = 
cpu_to_le32(wdata->bytes); + req->Length = cpu_to_le32(io_parms->length); #endif if (wdata->credits.value > 0) { @@ -4596,7 +4633,7 @@ smb2_async_writev(struct cifs_writedata *wdata, SMB2_MAX_BUFFER_SIZE)); shdr->CreditRequest = cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 8); - rc = adjust_credits(server, &wdata->credits, wdata->bytes); + rc = adjust_credits(server, &wdata->credits, io_parms->length); if (rc) goto async_writev_out; @@ -4609,9 +4646,12 @@ smb2_async_writev(struct cifs_writedata *wdata, if (rc) { trace_smb3_write_err(0 /* no xid */, - req->PersistentFileId, - tcon->tid, tcon->ses->Suid, wdata->offset, - wdata->bytes, rc); + io_parms->persistent_fid, + io_parms->tcon->tid, + io_parms->tcon->ses->Suid, + io_parms->offset, + io_parms->length, + rc); kref_put(&wdata->refcount, release); cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); } @@ -4906,7 +4946,7 @@ int SMB2_query_directory_init(const unsigned int xid, memcpy(bufptr, &asteriks, len); req->FileNameOffset = - cpu_to_le16(sizeof(struct smb2_query_directory_req) - 1); + cpu_to_le16(sizeof(struct smb2_query_directory_req)); req->FileNameLength = cpu_to_le16(len); /* * BB could be 30 bytes or so longer if we used SMB2 specific @@ -4951,10 +4991,10 @@ smb2_parse_query_directory(struct cifs_tcon *tcon, switch (srch_inf->info_level) { case SMB_FIND_FILE_DIRECTORY_INFO: - info_buf_size = sizeof(FILE_DIRECTORY_INFO) - 1; + info_buf_size = sizeof(FILE_DIRECTORY_INFO); break; case SMB_FIND_FILE_ID_FULL_DIR_INFO: - info_buf_size = sizeof(SEARCH_ID_FULL_DIR_INFO) - 1; + info_buf_size = sizeof(SEARCH_ID_FULL_DIR_INFO); break; case SMB_FIND_FILE_POSIX_INFO: /* note that posix payload are variable size */ @@ -5102,8 +5142,7 @@ SMB2_set_info_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, req->VolatileFileId = volatile_fid; req->AdditionalInformation = cpu_to_le32(additional_info); - req->BufferOffset = - cpu_to_le16(sizeof(struct smb2_set_info_req) - 1); + req->BufferOffset = cpu_to_le16(sizeof(struct smb2_set_info_req)); req->BufferLength = cpu_to_le32(*size); memcpy(req->Buffer, *data, *size); @@ -5337,9 +5376,9 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, req->VolatileFileId = volatile_fid; /* 1 for pad */ req->InputBufferOffset = - cpu_to_le16(sizeof(struct smb2_query_info_req) - 1); + cpu_to_le16(sizeof(struct smb2_query_info_req)); req->OutputBufferLength = cpu_to_le32( - outbuf_len + sizeof(struct smb2_query_info_rsp) - 1); + outbuf_len + sizeof(struct smb2_query_info_rsp)); iov->iov_base = (char *)req; iov->iov_len = total_len; diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 1237bb86e93a..2114e8a0c63a 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -57,7 +57,7 @@ struct smb2_rdma_crypto_transform { #define COMPOUND_FID 0xFFFFFFFFFFFFFFFFULL #define SMB2_SYMLINK_STRUCT_SIZE \ - (sizeof(struct smb2_err_rsp) - 1 + sizeof(struct smb2_symlink_err_rsp)) + (sizeof(struct smb2_err_rsp) + sizeof(struct smb2_symlink_err_rsp)) #define SYMLINK_ERROR_TAG 0x4c4d5953 @@ -371,7 +371,7 @@ struct smb2_file_id_extd_directory_info { __le32 EaSize; /* EA size */ __le32 ReparsePointTag; /* valid if FILE_ATTR_REPARSE_POINT set in FileAttributes */ __le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit */ - char FileName[1]; + char FileName[]; } __packed; /* level 60 */ extern char smb2_padding[7]; diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 90789aaa6567..55b6e319a61d 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -34,16 +34,21 @@ static int smbd_post_recv( 
struct smbd_response *response); static int smbd_post_send_empty(struct smbd_connection *info); -static int smbd_post_send_data( - struct smbd_connection *info, - struct kvec *iov, int n_vec, int remaining_data_length); -static int smbd_post_send_page(struct smbd_connection *info, - struct page *page, unsigned long offset, - size_t size, int remaining_data_length); static void destroy_mr_list(struct smbd_connection *info); static int allocate_mr_list(struct smbd_connection *info); +struct smb_extract_to_rdma { + struct ib_sge *sge; + unsigned int nr_sge; + unsigned int max_sge; + struct ib_device *device; + u32 local_dma_lkey; + enum dma_data_direction direction; +}; +static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len, + struct smb_extract_to_rdma *rdma); + /* SMBD version number */ #define SMBD_V1 0x0100 @@ -823,16 +828,16 @@ static int smbd_post_send(struct smbd_connection *info, return rc; } -static int smbd_post_send_sgl(struct smbd_connection *info, - struct scatterlist *sgl, int data_length, int remaining_data_length) +static int smbd_post_send_iter(struct smbd_connection *info, + struct iov_iter *iter, + int *_remaining_data_length) { - int num_sgs; int i, rc; int header_length; + int data_length; struct smbd_request *request; struct smbd_data_transfer *packet; int new_credits; - struct scatterlist *sg; wait_credit: /* Wait for send credits. A SMBD packet needs one credit */ @@ -876,6 +881,30 @@ wait_send_queue: } request->info = info; + memset(request->sge, 0, sizeof(request->sge)); + + /* Fill in the data payload to find out how much data we can add */ + if (iter) { + struct smb_extract_to_rdma extract = { + .nr_sge = 1, + .max_sge = SMBDIRECT_MAX_SEND_SGE, + .sge = request->sge, + .device = info->id->device, + .local_dma_lkey = info->pd->local_dma_lkey, + .direction = DMA_TO_DEVICE, + }; + + rc = smb_extract_iter_to_rdma(iter, *_remaining_data_length, + &extract); + if (rc < 0) + goto err_dma; + data_length = rc; + request->num_sge = extract.nr_sge; + *_remaining_data_length -= data_length; + } else { + data_length = 0; + request->num_sge = 1; + } /* Fill in the packet header */ packet = smbd_request_payload(request); @@ -897,7 +926,7 @@ wait_send_queue: else packet->data_offset = cpu_to_le32(24); packet->data_length = cpu_to_le32(data_length); - packet->remaining_data_length = cpu_to_le32(remaining_data_length); + packet->remaining_data_length = cpu_to_le32(*_remaining_data_length); packet->padding = 0; log_outgoing(INFO, "credits_requested=%d credits_granted=%d data_offset=%d data_length=%d remaining_data_length=%d\n", @@ -913,7 +942,6 @@ wait_send_queue: if (!data_length) header_length = offsetof(struct smbd_data_transfer, padding); - request->num_sge = 1; request->sge[0].addr = ib_dma_map_single(info->id->device, (void *)packet, header_length, @@ -927,23 +955,6 @@ wait_send_queue: request->sge[0].length = header_length; request->sge[0].lkey = info->pd->local_dma_lkey; - /* Fill in the packet data payload */ - num_sgs = sgl ? 
sg_nents(sgl) : 0; - for_each_sg(sgl, sg, num_sgs, i) { - request->sge[i+1].addr = - ib_dma_map_page(info->id->device, sg_page(sg), - sg->offset, sg->length, DMA_TO_DEVICE); - if (ib_dma_mapping_error( - info->id->device, request->sge[i+1].addr)) { - rc = -EIO; - request->sge[i+1].addr = 0; - goto err_dma; - } - request->sge[i+1].length = sg->length; - request->sge[i+1].lkey = info->pd->local_dma_lkey; - request->num_sge++; - } - rc = smbd_post_send(info, request); if (!rc) return 0; @@ -976,61 +987,16 @@ err_wait_credit: } /* - * Send a page - * page: the page to send - * offset: offset in the page to send - * size: length in the page to send - * remaining_data_length: remaining data to send in this payload - */ -static int smbd_post_send_page(struct smbd_connection *info, struct page *page, - unsigned long offset, size_t size, int remaining_data_length) -{ - struct scatterlist sgl; - - sg_init_table(&sgl, 1); - sg_set_page(&sgl, page, size, offset); - - return smbd_post_send_sgl(info, &sgl, size, remaining_data_length); -} - -/* * Send an empty message * Empty message is used to extend credits to peer to for keep live * while there is no upper layer payload to send at the time */ static int smbd_post_send_empty(struct smbd_connection *info) { - info->count_send_empty++; - return smbd_post_send_sgl(info, NULL, 0, 0); -} - -/* - * Send a data buffer - * iov: the iov array describing the data buffers - * n_vec: number of iov array - * remaining_data_length: remaining data to send following this packet - * in segmented SMBD packet - */ -static int smbd_post_send_data( - struct smbd_connection *info, struct kvec *iov, int n_vec, - int remaining_data_length) -{ - int i; - u32 data_length = 0; - struct scatterlist sgl[SMBDIRECT_MAX_SEND_SGE - 1]; - - if (n_vec > SMBDIRECT_MAX_SEND_SGE - 1) { - cifs_dbg(VFS, "Can't fit data to SGL, n_vec=%d\n", n_vec); - return -EINVAL; - } + int remaining_data_length = 0; - sg_init_table(sgl, n_vec); - for (i = 0; i < n_vec; i++) { - data_length += iov[i].iov_len; - sg_set_buf(&sgl[i], iov[i].iov_base, iov[i].iov_len); - } - - return smbd_post_send_sgl(info, sgl, data_length, remaining_data_length); + info->count_send_empty++; + return smbd_post_send_iter(info, NULL, &remaining_data_length); } /* @@ -1405,6 +1371,7 @@ void smbd_destroy(struct TCP_Server_Info *server) destroy_workqueue(info->workqueue); log_rdma_event(INFO, "rdma session destroyed\n"); kfree(info); + server->smbd_conn = NULL; } /* @@ -1699,6 +1666,7 @@ static struct smbd_connection *_smbd_get_connection( allocate_mr_failed: /* At this point, need to a full transport shutdown */ + server->smbd_conn = info; smbd_destroy(server); return NULL; @@ -1984,18 +1952,10 @@ int smbd_send(struct TCP_Server_Info *server, int num_rqst, struct smb_rqst *rqst_array) { struct smbd_connection *info = server->smbd_conn; - struct kvec vecs[SMBDIRECT_MAX_SEND_SGE - 1]; - int nvecs; - int size; - unsigned int buflen, remaining_data_length; - unsigned int offset, remaining_vec_data_length; - int start, i, j; - int max_iov_size = - info->max_send_size - sizeof(struct smbd_data_transfer); - struct kvec *iov; - int rc; struct smb_rqst *rqst; - int rqst_idx; + struct iov_iter iter; + unsigned int remaining_data_length, klen; + int rc, i, rqst_idx; if (info->transport_status != SMBD_CONNECTED) return -EAGAIN; @@ -2022,84 +1982,36 @@ int smbd_send(struct TCP_Server_Info *server, rqst_idx = 0; do { rqst = &rqst_array[rqst_idx]; - iov = rqst->rq_iov; cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n", - rqst_idx, 
smb_rqst_len(server, rqst)); - remaining_vec_data_length = 0; - for (i = 0; i < rqst->rq_nvec; i++) { - remaining_vec_data_length += iov[i].iov_len; - dump_smb(iov[i].iov_base, iov[i].iov_len); - } - - log_write(INFO, "rqst_idx=%d nvec=%d rqst->rq_npages=%d rq_pagesz=%d rq_tailsz=%d buflen=%lu\n", - rqst_idx, rqst->rq_nvec, - rqst->rq_npages, rqst->rq_pagesz, - rqst->rq_tailsz, smb_rqst_len(server, rqst)); - - start = 0; - offset = 0; - do { - buflen = 0; - i = start; - j = 0; - while (i < rqst->rq_nvec && - j < SMBDIRECT_MAX_SEND_SGE - 1 && - buflen < max_iov_size) { - - vecs[j].iov_base = iov[i].iov_base + offset; - if (buflen + iov[i].iov_len > max_iov_size) { - vecs[j].iov_len = - max_iov_size - iov[i].iov_len; - buflen = max_iov_size; - offset = vecs[j].iov_len; - } else { - vecs[j].iov_len = - iov[i].iov_len - offset; - buflen += vecs[j].iov_len; - offset = 0; - ++i; - } - ++j; - } + rqst_idx, smb_rqst_len(server, rqst)); + for (i = 0; i < rqst->rq_nvec; i++) + dump_smb(rqst->rq_iov[i].iov_base, rqst->rq_iov[i].iov_len); + + log_write(INFO, "RDMA-WR[%u] nvec=%d len=%u iter=%zu rqlen=%lu\n", + rqst_idx, rqst->rq_nvec, remaining_data_length, + iov_iter_count(&rqst->rq_iter), smb_rqst_len(server, rqst)); + + /* Send the metadata pages. */ + klen = 0; + for (i = 0; i < rqst->rq_nvec; i++) + klen += rqst->rq_iov[i].iov_len; + iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen); + + rc = smbd_post_send_iter(info, &iter, &remaining_data_length); + if (rc < 0) + break; - remaining_vec_data_length -= buflen; - remaining_data_length -= buflen; - log_write(INFO, "sending %s iov[%d] from start=%d nvecs=%d remaining_data_length=%d\n", - remaining_vec_data_length > 0 ? - "partial" : "complete", - rqst->rq_nvec, start, j, - remaining_data_length); - - start = i; - rc = smbd_post_send_data(info, vecs, j, remaining_data_length); - if (rc) - goto done; - } while (remaining_vec_data_length > 0); - - /* now sending pages if there are any */ - for (i = 0; i < rqst->rq_npages; i++) { - rqst_page_get_length(rqst, i, &buflen, &offset); - nvecs = (buflen + max_iov_size - 1) / max_iov_size; - log_write(INFO, "sending pages buflen=%d nvecs=%d\n", - buflen, nvecs); - for (j = 0; j < nvecs; j++) { - size = min_t(unsigned int, max_iov_size, remaining_data_length); - remaining_data_length -= size; - log_write(INFO, "sending pages i=%d offset=%d size=%d remaining_data_length=%d\n", - i, j * max_iov_size + offset, size, - remaining_data_length); - rc = smbd_post_send_page( - info, rqst->rq_pages[i], - j*max_iov_size + offset, - size, remaining_data_length); - if (rc) - goto done; - } + if (iov_iter_count(&rqst->rq_iter) > 0) { + /* And then the data pages if there are any */ + rc = smbd_post_send_iter(info, &rqst->rq_iter, + &remaining_data_length); + if (rc < 0) + break; } + } while (++rqst_idx < num_rqst); -done: /* * As an optimization, we don't wait for individual I/O to finish * before sending the next one. 
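
[Editorial sketch, not part of the patch.] The rewritten smbd_send() above collapses the old per-kvec/per-page walking into two calls to a single iterator-based sender. A minimal sketch of that calling pattern, assuming the smbd_post_send_iter() added earlier in this file; the wrapper name my_send_one_rqst is hypothetical:

#include <linux/uio.h>

/*
 * Hypothetical wrapper mirroring the new flow: wrap the SMB header kvecs
 * in an ITER_SOURCE iterator, send them, then send the data payload via
 * the request's own iterator.  One sender replaces the removed
 * smbd_post_send_data()/smbd_post_send_page() pair.
 */
static int my_send_one_rqst(struct smbd_connection *info,
                            struct smb_rqst *rqst, int *remaining)
{
        struct iov_iter iter;
        size_t klen = 0;
        int i, rc;

        /* Total length of the SMB header kvecs. */
        for (i = 0; i < rqst->rq_nvec; i++)
                klen += rqst->rq_iov[i].iov_len;
        iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen);

        rc = smbd_post_send_iter(info, &iter, remaining);
        if (rc < 0)
                return rc;

        /* Then the data pages, if the request carries any. */
        if (iov_iter_count(&rqst->rq_iter) > 0)
                rc = smbd_post_send_iter(info, &rqst->rq_iter, remaining);
        return rc;
}

Note that the remaining-data counter is decremented inside smbd_post_send_iter() itself, which is why one counter is threaded through both calls.
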
@@ -2190,10 +2102,10 @@ static void destroy_mr_list(struct smbd_connection *info) cancel_work_sync(&info->mr_recovery_work); list_for_each_entry_safe(mr, tmp, &info->mr_list, list) { if (mr->state == MR_INVALIDATED) - ib_dma_unmap_sg(info->id->device, mr->sgl, - mr->sgl_count, mr->dir); + ib_dma_unmap_sg(info->id->device, mr->sgt.sgl, + mr->sgt.nents, mr->dir); ib_dereg_mr(mr->mr); - kfree(mr->sgl); + kfree(mr->sgt.sgl); kfree(mr); } } @@ -2216,6 +2128,7 @@ static int allocate_mr_list(struct smbd_connection *info) atomic_set(&info->mr_ready_count, 0); atomic_set(&info->mr_used_count, 0); init_waitqueue_head(&info->wait_for_mr_cleanup); + INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work); /* Allocate more MRs (2x) than hardware responder_resources */ for (i = 0; i < info->responder_resources * 2; i++) { smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL); @@ -2228,11 +2141,10 @@ static int allocate_mr_list(struct smbd_connection *info) info->mr_type, info->max_frmr_depth); goto out; } - smbdirect_mr->sgl = kcalloc( - info->max_frmr_depth, - sizeof(struct scatterlist), - GFP_KERNEL); - if (!smbdirect_mr->sgl) { + smbdirect_mr->sgt.sgl = kcalloc(info->max_frmr_depth, + sizeof(struct scatterlist), + GFP_KERNEL); + if (!smbdirect_mr->sgt.sgl) { log_rdma_mr(ERR, "failed to allocate sgl\n"); ib_dereg_mr(smbdirect_mr->mr); goto out; @@ -2243,15 +2155,15 @@ static int allocate_mr_list(struct smbd_connection *info) list_add_tail(&smbdirect_mr->list, &info->mr_list); atomic_inc(&info->mr_ready_count); } - INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work); return 0; out: kfree(smbdirect_mr); list_for_each_entry_safe(smbdirect_mr, tmp, &info->mr_list, list) { + list_del(&smbdirect_mr->list); ib_dereg_mr(smbdirect_mr->mr); - kfree(smbdirect_mr->sgl); + kfree(smbdirect_mr->sgt.sgl); kfree(smbdirect_mr); } return -ENOMEM; @@ -2304,26 +2216,45 @@ again: } /* + * Transcribe the pages from an iterator into an MR scatterlist. + */ +static int smbd_iter_to_mr(struct smbd_connection *info, + struct iov_iter *iter, + struct sg_table *sgt, + unsigned int max_sg) +{ + int ret; + + memset(sgt->sgl, 0, max_sg * sizeof(struct scatterlist)); + + ret = netfs_extract_iter_to_sg(iter, iov_iter_count(iter), sgt, max_sg, 0); + WARN_ON(ret < 0); + if (sgt->nents > 0) + sg_mark_end(&sgt->sgl[sgt->nents - 1]); + return ret; +} + +/* * Register memory for RDMA read/write - * pages[]: the list of pages to register memory with - * num_pages: the number of pages to register - * tailsz: if non-zero, the bytes to register in the last page + * iter: the buffer to register memory with * writing: true if this is a RDMA write (SMB read), false for RDMA read * need_invalidate: true if this MR needs to be locally invalidated after I/O * return value: the MR registered, NULL if failed. 
*/ -struct smbd_mr *smbd_register_mr( - struct smbd_connection *info, struct page *pages[], int num_pages, - int offset, int tailsz, bool writing, bool need_invalidate) +struct smbd_mr *smbd_register_mr(struct smbd_connection *info, + struct iov_iter *iter, + bool writing, bool need_invalidate) { struct smbd_mr *smbdirect_mr; - int rc, i; + int rc, num_pages; enum dma_data_direction dir; struct ib_reg_wr *reg_wr; + num_pages = iov_iter_npages(iter, info->max_frmr_depth + 1); if (num_pages > info->max_frmr_depth) { log_rdma_mr(ERR, "num_pages=%d max_frmr_depth=%d\n", num_pages, info->max_frmr_depth); + WARN_ON_ONCE(1); return NULL; } @@ -2332,45 +2263,31 @@ struct smbd_mr *smbd_register_mr( log_rdma_mr(ERR, "get_mr returning NULL\n"); return NULL; } - smbdirect_mr->need_invalidate = need_invalidate; - smbdirect_mr->sgl_count = num_pages; - sg_init_table(smbdirect_mr->sgl, num_pages); - - log_rdma_mr(INFO, "num_pages=0x%x offset=0x%x tailsz=0x%x\n", - num_pages, offset, tailsz); - - if (num_pages == 1) { - sg_set_page(&smbdirect_mr->sgl[0], pages[0], tailsz, offset); - goto skip_multiple_pages; - } - /* We have at least two pages to register */ - sg_set_page( - &smbdirect_mr->sgl[0], pages[0], PAGE_SIZE - offset, offset); - i = 1; - while (i < num_pages - 1) { - sg_set_page(&smbdirect_mr->sgl[i], pages[i], PAGE_SIZE, 0); - i++; - } - sg_set_page(&smbdirect_mr->sgl[i], pages[i], - tailsz ? tailsz : PAGE_SIZE, 0); - -skip_multiple_pages: dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; smbdirect_mr->dir = dir; - rc = ib_dma_map_sg(info->id->device, smbdirect_mr->sgl, num_pages, dir); + smbdirect_mr->need_invalidate = need_invalidate; + smbdirect_mr->sgt.nents = 0; + smbdirect_mr->sgt.orig_nents = 0; + + log_rdma_mr(INFO, "num_pages=0x%x count=0x%zx depth=%u\n", + num_pages, iov_iter_count(iter), info->max_frmr_depth); + smbd_iter_to_mr(info, iter, &smbdirect_mr->sgt, info->max_frmr_depth); + + rc = ib_dma_map_sg(info->id->device, smbdirect_mr->sgt.sgl, + smbdirect_mr->sgt.nents, dir); if (!rc) { log_rdma_mr(ERR, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n", num_pages, dir, rc); goto dma_map_error; } - rc = ib_map_mr_sg(smbdirect_mr->mr, smbdirect_mr->sgl, num_pages, - NULL, PAGE_SIZE); - if (rc != num_pages) { + rc = ib_map_mr_sg(smbdirect_mr->mr, smbdirect_mr->sgt.sgl, + smbdirect_mr->sgt.nents, NULL, PAGE_SIZE); + if (rc != smbdirect_mr->sgt.nents) { log_rdma_mr(ERR, - "ib_map_mr_sg failed rc = %d num_pages = %x\n", - rc, num_pages); + "ib_map_mr_sg failed rc = %d nents = %x\n", + rc, smbdirect_mr->sgt.nents); goto map_mr_error; } @@ -2402,8 +2319,8 @@ skip_multiple_pages: /* If all failed, attempt to recover this MR by setting it MR_ERROR*/ map_mr_error: - ib_dma_unmap_sg(info->id->device, smbdirect_mr->sgl, - smbdirect_mr->sgl_count, smbdirect_mr->dir); + ib_dma_unmap_sg(info->id->device, smbdirect_mr->sgt.sgl, + smbdirect_mr->sgt.nents, smbdirect_mr->dir); dma_map_error: smbdirect_mr->state = MR_ERROR; @@ -2470,8 +2387,8 @@ int smbd_deregister_mr(struct smbd_mr *smbdirect_mr) if (smbdirect_mr->state == MR_INVALIDATED) { ib_dma_unmap_sg( - info->id->device, smbdirect_mr->sgl, - smbdirect_mr->sgl_count, + info->id->device, smbdirect_mr->sgt.sgl, + smbdirect_mr->sgt.nents, smbdirect_mr->dir); smbdirect_mr->state = MR_READY; if (atomic_inc_return(&info->mr_ready_count) == 1) @@ -2489,3 +2406,206 @@ done: return rc; } + +static bool smb_set_sge(struct smb_extract_to_rdma *rdma, + struct page *lowest_page, size_t off, size_t len) +{ + struct ib_sge *sge = &rdma->sge[rdma->nr_sge]; + u64 addr; + 
+ addr = ib_dma_map_page(rdma->device, lowest_page, + off, len, rdma->direction); + if (ib_dma_mapping_error(rdma->device, addr)) + return false; + + sge->addr = addr; + sge->length = len; + sge->lkey = rdma->local_dma_lkey; + rdma->nr_sge++; + return true; +} + +/* + * Extract page fragments from a BVEC-class iterator and add them to an RDMA + * element list. The pages are not pinned. + */ +static ssize_t smb_extract_bvec_to_rdma(struct iov_iter *iter, + struct smb_extract_to_rdma *rdma, + ssize_t maxsize) +{ + const struct bio_vec *bv = iter->bvec; + unsigned long start = iter->iov_offset; + unsigned int i; + ssize_t ret = 0; + + for (i = 0; i < iter->nr_segs; i++) { + size_t off, len; + + len = bv[i].bv_len; + if (start >= len) { + start -= len; + continue; + } + + len = min_t(size_t, maxsize, len - start); + off = bv[i].bv_offset + start; + + if (!smb_set_sge(rdma, bv[i].bv_page, off, len)) + return -EIO; + + ret += len; + maxsize -= len; + if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0) + break; + start = 0; + } + + return ret; +} + +/* + * Extract fragments from a KVEC-class iterator and add them to an RDMA list. + * This can deal with vmalloc'd buffers as well as kmalloc'd or static buffers. + * The pages are not pinned. + */ +static ssize_t smb_extract_kvec_to_rdma(struct iov_iter *iter, + struct smb_extract_to_rdma *rdma, + ssize_t maxsize) +{ + const struct kvec *kv = iter->kvec; + unsigned long start = iter->iov_offset; + unsigned int i; + ssize_t ret = 0; + + for (i = 0; i < iter->nr_segs; i++) { + struct page *page; + unsigned long kaddr; + size_t off, len, seg; + + len = kv[i].iov_len; + if (start >= len) { + start -= len; + continue; + } + + kaddr = (unsigned long)kv[i].iov_base + start; + off = kaddr & ~PAGE_MASK; + len = min_t(size_t, maxsize, len - start); + kaddr &= PAGE_MASK; + + maxsize -= len; + do { + seg = min_t(size_t, len, PAGE_SIZE - off); + + if (is_vmalloc_or_module_addr((void *)kaddr)) + page = vmalloc_to_page((void *)kaddr); + else + page = virt_to_page(kaddr); + + if (!smb_set_sge(rdma, page, off, seg)) + return -EIO; + + ret += seg; + len -= seg; + kaddr += PAGE_SIZE; + off = 0; + } while (len > 0 && rdma->nr_sge < rdma->max_sge); + + if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0) + break; + start = 0; + } + + return ret; +} + +/* + * Extract folio fragments from an XARRAY-class iterator and add them to an + * RDMA list. The folios are not pinned. + */ +static ssize_t smb_extract_xarray_to_rdma(struct iov_iter *iter, + struct smb_extract_to_rdma *rdma, + ssize_t maxsize) +{ + struct xarray *xa = iter->xarray; + struct folio *folio; + loff_t start = iter->xarray_start + iter->iov_offset; + pgoff_t index = start / PAGE_SIZE; + ssize_t ret = 0; + size_t off, len; + XA_STATE(xas, xa, index); + + rcu_read_lock(); + + xas_for_each(&xas, folio, ULONG_MAX) { + if (xas_retry(&xas, folio)) + continue; + if (WARN_ON(xa_is_value(folio))) + break; + if (WARN_ON(folio_test_hugetlb(folio))) + break; + + off = offset_in_folio(folio, start); + len = min_t(size_t, maxsize, folio_size(folio) - off); + + if (!smb_set_sge(rdma, folio_page(folio, 0), off, len)) { + rcu_read_unlock(); + return -EIO; + } + + maxsize -= len; + ret += len; + if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0) + break; + } + + rcu_read_unlock(); + return ret; +} + +/* + * Extract page fragments from up to the given amount of the source iterator + * and build up an RDMA list that refers to all of those bits. 
The RDMA list + * is appended to, up to the maximum number of elements set in the parameter + * block. + * + * The extracted page fragments are not pinned or ref'd in any way; if an + * IOVEC/UBUF-type iterator is to be used, it should be converted to a + * BVEC-type iterator and the pages pinned, ref'd or otherwise held in some + * way. + */ +static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len, + struct smb_extract_to_rdma *rdma) +{ + ssize_t ret; + int before = rdma->nr_sge; + + switch (iov_iter_type(iter)) { + case ITER_BVEC: + ret = smb_extract_bvec_to_rdma(iter, rdma, len); + break; + case ITER_KVEC: + ret = smb_extract_kvec_to_rdma(iter, rdma, len); + break; + case ITER_XARRAY: + ret = smb_extract_xarray_to_rdma(iter, rdma, len); + break; + default: + WARN_ON_ONCE(1); + return -EIO; + } + + if (ret > 0) { + iov_iter_advance(iter, ret); + } else if (ret < 0) { + while (rdma->nr_sge > before) { + struct ib_sge *sge = &rdma->sge[rdma->nr_sge--]; + + ib_dma_unmap_single(rdma->device, sge->addr, sge->length, + rdma->direction); + sge->addr = 0; + } + } + + return ret; +} diff --git a/fs/cifs/smbdirect.h b/fs/cifs/smbdirect.h index 207ef979cd51..83f239f376f0 100644 --- a/fs/cifs/smbdirect.h +++ b/fs/cifs/smbdirect.h @@ -288,8 +288,7 @@ struct smbd_mr { struct list_head list; enum mr_state state; struct ib_mr *mr; - struct scatterlist *sgl; - int sgl_count; + struct sg_table sgt; enum dma_data_direction dir; union { struct ib_reg_wr wr; @@ -302,8 +301,8 @@ struct smbd_mr { /* Interfaces to register and deregister MR for RDMA read/write */ struct smbd_mr *smbd_register_mr( - struct smbd_connection *info, struct page *pages[], int num_pages, - int offset, int tailsz, bool writing, bool need_invalidate); + struct smbd_connection *info, struct iov_iter *iter, + bool writing, bool need_invalidate); int smbd_deregister_mr(struct smbd_mr *mr); #else diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 3851d0aaa288..b42050c68e6c 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -270,26 +270,7 @@ smb_rqst_len(struct TCP_Server_Info *server, struct smb_rqst *rqst) for (i = 0; i < nvec; i++) buflen += iov[i].iov_len; - /* - * Add in the page array if there is one. The caller needs to make - * sure rq_offset and rq_tailsz are set correctly. If a buffer of - * multiple pages ends at page boundary, rq_tailsz needs to be set to - * PAGE_SIZE. 
- */ - if (rqst->rq_npages) { - if (rqst->rq_npages == 1) - buflen += rqst->rq_tailsz; - else { - /* - * If there is more than one page, calculate the - * buffer length based on rq_offset and rq_tailsz - */ - buflen += rqst->rq_pagesz * (rqst->rq_npages - 1) - - rqst->rq_offset; - buflen += rqst->rq_tailsz; - } - } - + buflen += iov_iter_count(&rqst->rq_iter); return buflen; } @@ -376,23 +357,15 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, total_len += sent; - /* now walk the page array and send each page in it */ - for (i = 0; i < rqst[j].rq_npages; i++) { - struct bio_vec bvec; - - bvec.bv_page = rqst[j].rq_pages[i]; - rqst_page_get_length(&rqst[j], i, &bvec.bv_len, - &bvec.bv_offset); - - iov_iter_bvec(&smb_msg.msg_iter, ITER_SOURCE, - &bvec, 1, bvec.bv_len); + if (iov_iter_count(&rqst[j].rq_iter) > 0) { + smb_msg.msg_iter = rqst[j].rq_iter; rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) break; - total_len += sent; } - } + +} unmask: sigprocmask(SIG_SETMASK, &oldmask, NULL); @@ -1034,15 +1007,40 @@ cifs_cancelled_callback(struct mid_q_entry *mid) struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses) { uint index = 0; + unsigned int min_in_flight = UINT_MAX, max_in_flight = 0; + struct TCP_Server_Info *server = NULL; + int i; if (!ses) return NULL; - /* round robin */ - index = (uint)atomic_inc_return(&ses->chan_seq); - spin_lock(&ses->chan_lock); - index %= ses->chan_count; + for (i = 0; i < ses->chan_count; i++) { + server = ses->chans[i].server; + if (!server) + continue; + + /* + * strictly speaking, we should pick up req_lock to read + * server->in_flight. But it shouldn't matter much here if we + * race while reading this data. The worst that can happen is + * that we could use a channel that's not least loaded. Avoiding + * taking the lock could help reduce wait time, which is + * important for this function + */ + if (server->in_flight < min_in_flight) { + min_in_flight = server->in_flight; + index = i; + } + if (server->in_flight > max_in_flight) + max_in_flight = server->in_flight; + } + + /* if all channels are equally loaded, fall back to round-robin */ + if (min_in_flight == max_in_flight) { + index = (uint)atomic_inc_return(&ses->chan_seq); + index %= ses->chan_count; + } spin_unlock(&ses->chan_lock); return ses->chans[index].server; @@ -1640,11 +1638,11 @@ int cifs_discard_remaining_data(struct TCP_Server_Info *server) { unsigned int rfclen = server->pdu_size; - int remaining = rfclen + HEADER_PREAMBLE_SIZE(server) - + size_t remaining = rfclen + HEADER_PREAMBLE_SIZE(server) - server->total_read; while (remaining > 0) { - int length; + ssize_t length; length = cifs_discard_from_socket(server, min_t(size_t, remaining, @@ -1790,10 +1788,15 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) return cifs_readv_discard(server, mid); } - length = rdata->read_into_pages(server, rdata, data_len); - if (length < 0) - return length; - +#ifdef CONFIG_CIFS_SMB_DIRECT + if (rdata->mr) + length = data_len; /* An RDMA read is already done. 
*/ + else +#endif + length = cifs_read_iter_from_socket(server, &rdata->iter, + data_len); + if (length > 0) + rdata->got_bytes += length; server->total_read += length; cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n", diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 5f2fb2fd2e37..50e762fa1a14 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -89,7 +89,7 @@ static int cifs_creation_time_set(unsigned int xid, struct cifs_tcon *pTcon, } static int cifs_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index 9be281bbcc06..dd6277d87afb 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h @@ -46,12 +46,12 @@ extern const struct file_operations coda_ioctl_operations; /* operations shared over more than one file */ int coda_open(struct inode *i, struct file *f); int coda_release(struct inode *i, struct file *f); -int coda_permission(struct user_namespace *mnt_userns, struct inode *inode, +int coda_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); int coda_revalidate_inode(struct inode *); -int coda_getattr(struct user_namespace *, const struct path *, struct kstat *, +int coda_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); -int coda_setattr(struct user_namespace *, struct dentry *, struct iattr *); +int coda_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); /* this file: helpers */ char *coda_f2s(struct CodaFid *f); diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 328d7a684b63..8450b1bd354b 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -73,7 +73,7 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsig } -int coda_permission(struct user_namespace *mnt_userns, struct inode *inode, +int coda_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int error; @@ -133,7 +133,7 @@ static inline void coda_dir_drop_nlink(struct inode *dir) } /* creation routines: create, mknod, mkdir, link, symlink */ -static int coda_create(struct user_namespace *mnt_userns, struct inode *dir, +static int coda_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *de, umode_t mode, bool excl) { int error; @@ -166,7 +166,7 @@ err_out: return error; } -static int coda_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int coda_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *de, umode_t mode) { struct inode *inode; @@ -228,7 +228,7 @@ static int coda_link(struct dentry *source_de, struct inode *dir_inode, } -static int coda_symlink(struct user_namespace *mnt_userns, +static int coda_symlink(struct mnt_idmap *idmap, struct inode *dir_inode, struct dentry *de, const char *symname) { @@ -295,7 +295,7 @@ static int coda_rmdir(struct inode *dir, struct dentry *de) } /* rename */ -static int coda_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int coda_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 2185328b65c7..d661e6cf17ac 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -251,16 +251,16 @@ static void coda_evict_inode(struct inode *inode) coda_cache_clear_inode(inode); } -int coda_getattr(struct user_namespace *mnt_userns, const struct path 
*path, +int coda_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { int err = coda_revalidate_inode(d_inode(path->dentry)); if (!err) - generic_fillattr(&init_user_ns, d_inode(path->dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat); return err; } -int coda_setattr(struct user_namespace *mnt_userns, struct dentry *de, +int coda_setattr(struct mnt_idmap *idmap, struct dentry *de, struct iattr *iattr) { struct inode *inode = d_inode(de); diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index cb9fd59a688c..36e35c15561a 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -24,7 +24,7 @@ #include "coda_linux.h" /* pioctl ops */ -static int coda_ioctl_permission(struct user_namespace *mnt_userns, +static int coda_ioctl_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); static long coda_pioctl(struct file *filp, unsigned int cmd, unsigned long user_data); @@ -41,7 +41,7 @@ const struct file_operations coda_ioctl_operations = { }; /* the coda pioctl inode ops */ -static int coda_ioctl_permission(struct user_namespace *mnt_userns, +static int coda_ioctl_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { return (mask & MAY_EXEC) ? -EACCES : 0; diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index 59f6cfd06f96..cd6a3721f6f6 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -791,7 +791,7 @@ static int coda_upcall(struct venus_comm *vcp, sig_req = kmalloc(sizeof(struct upc_req), GFP_KERNEL); if (!sig_req) goto exit; - sig_inputArgs = kvzalloc(sizeof(struct coda_in_hdr), GFP_KERNEL); + sig_inputArgs = kvzalloc(sizeof(*sig_inputArgs), GFP_KERNEL); if (!sig_inputArgs) { kfree(sig_req); goto exit; diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index c0395363eab9..e710a1782382 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -77,7 +77,7 @@ extern void configfs_hash_and_remove(struct dentry * dir, const char * name); extern const unsigned char * configfs_get_name(struct configfs_dirent *sd); extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent); -extern int configfs_setattr(struct user_namespace *mnt_userns, +extern int configfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr); extern struct dentry *configfs_pin_fs(void); @@ -91,7 +91,7 @@ extern const struct inode_operations configfs_root_inode_operations; extern const struct inode_operations configfs_symlink_inode_operations; extern const struct dentry_operations configfs_dentry_ops; -extern int configfs_symlink(struct user_namespace *mnt_userns, +extern int configfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname); extern int configfs_unlink(struct inode *dir, struct dentry *dentry); diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index ec6519e1ca3b..4afcbbe63e68 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1251,7 +1251,7 @@ out_root_unlock: } EXPORT_SYMBOL(configfs_depend_item_unlocked); -static int configfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int configfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int ret = 0; diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index b601610e9907..1c15edbe70ff 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -32,7 +32,7 @@ static const struct inode_operations configfs_inode_operations ={ .setattr = 
configfs_setattr, }; -int configfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int configfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode * inode = d_inode(dentry); @@ -60,7 +60,7 @@ int configfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } /* attributes were changed atleast once in past */ - error = simple_setattr(mnt_userns, dentry, iattr); + error = simple_setattr(idmap, dentry, iattr); if (error) return error; diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 0623c3edcfb9..69133ec1fac2 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -137,7 +137,7 @@ static int get_target(const char *symname, struct path *path, } -int configfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +int configfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { int ret; @@ -196,7 +196,7 @@ int configfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (dentry->d_inode || d_unhashed(dentry)) ret = -EEXIST; else - ret = inode_permission(&init_user_ns, dir, + ret = inode_permission(&nop_mnt_idmap, dir, MAY_WRITE | MAY_EXEC); if (!ret) ret = type->ct_item_ops->allow_link(parent_item, target_item); diff --git a/fs/coredump.c b/fs/coredump.c index de78bde2991b..5df1e6e1eb2b 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -644,7 +644,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) goto close_fail; } } else { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; struct inode *inode; int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW | O_LARGEFILE | O_EXCL; @@ -722,8 +722,8 @@ void do_coredump(const kernel_siginfo_t *siginfo) * a process dumps core while its cwd is e.g. on a vfat * filesystem. 
*/ - mnt_userns = file_mnt_user_ns(cprm.file); - if (!vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), + idmap = file_mnt_idmap(cprm.file); + if (!vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) { pr_info_ratelimited("Core dump to %s aborted: cannot preserve file owner\n", cn.corename); @@ -736,7 +736,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) } if (!(cprm.file->f_mode & FMODE_CAN_WRITE)) goto close_fail; - if (do_truncate(mnt_userns, cprm.file->f_path.dentry, + if (do_truncate(idmap, cprm.file->f_path.dentry, 0, 0, cprm.file)) goto close_fail; } @@ -838,13 +838,33 @@ static int __dump_skip(struct coredump_params *cprm, size_t nr) } } +int dump_emit(struct coredump_params *cprm, const void *addr, int nr) +{ + if (cprm->to_skip) { + if (!__dump_skip(cprm, cprm->to_skip)) + return 0; + cprm->to_skip = 0; + } + return __dump_emit(cprm, addr, nr); +} +EXPORT_SYMBOL(dump_emit); + +void dump_skip_to(struct coredump_params *cprm, unsigned long pos) +{ + cprm->to_skip = pos - cprm->pos; +} +EXPORT_SYMBOL(dump_skip_to); + +void dump_skip(struct coredump_params *cprm, size_t nr) +{ + cprm->to_skip += nr; +} +EXPORT_SYMBOL(dump_skip); + +#ifdef CONFIG_ELF_CORE static int dump_emit_page(struct coredump_params *cprm, struct page *page) { - struct bio_vec bvec = { - .bv_page = page, - .bv_offset = 0, - .bv_len = PAGE_SIZE, - }; + struct bio_vec bvec; struct iov_iter iter; struct file *file = cprm->file; loff_t pos; @@ -860,6 +880,7 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page) if (dump_interrupted()) return 0; pos = file->f_pos; + bvec_set_page(&bvec, page, PAGE_SIZE, 0); iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE); n = __kernel_write_iter(cprm->file, &iter, &pos); if (n != PAGE_SIZE) @@ -871,30 +892,6 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page) return 1; } -int dump_emit(struct coredump_params *cprm, const void *addr, int nr) -{ - if (cprm->to_skip) { - if (!__dump_skip(cprm, cprm->to_skip)) - return 0; - cprm->to_skip = 0; - } - return __dump_emit(cprm, addr, nr); -} -EXPORT_SYMBOL(dump_emit); - -void dump_skip_to(struct coredump_params *cprm, unsigned long pos) -{ - cprm->to_skip = pos - cprm->pos; -} -EXPORT_SYMBOL(dump_skip_to); - -void dump_skip(struct coredump_params *cprm, size_t nr) -{ - cprm->to_skip += nr; -} -EXPORT_SYMBOL(dump_skip); - -#ifdef CONFIG_ELF_CORE int dump_user_range(struct coredump_params *cprm, unsigned long start, unsigned long len) { @@ -1111,14 +1108,14 @@ whole: * Helper function for iterating across a vma list. It ensures that the caller * will visit `gate_vma' prior to terminating the search. 
*/ -static struct vm_area_struct *coredump_next_vma(struct ma_state *mas, +static struct vm_area_struct *coredump_next_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct *gate_vma) { if (gate_vma && (vma == gate_vma)) return NULL; - vma = mas_next(mas, ULONG_MAX); + vma = vma_next(vmi); if (vma) return vma; return gate_vma; @@ -1146,7 +1143,7 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) { struct vm_area_struct *gate_vma, *vma = NULL; struct mm_struct *mm = current->mm; - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); int i = 0; /* @@ -1167,7 +1164,7 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) return false; } - while ((vma = coredump_next_vma(&mas, vma, gate_vma)) != NULL) { + while ((vma = coredump_next_vma(&vmi, vma, gate_vma)) != NULL) { struct core_vma_metadata *m = cprm->vma_meta + i; m->start = vma->vm_start; diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig index d98cef0dbb6b..4612c9bbf102 100644 --- a/fs/cramfs/Kconfig +++ b/fs/cramfs/Kconfig @@ -38,7 +38,7 @@ config CRAMFS_MTD default y if !CRAMFS_BLOCKDEV help This option allows the CramFs driver to load data directly from - a linear adressed memory range (usually non volatile memory + a linear addressed memory range (usually non-volatile memory like flash) instead of going through the block device layer. This saves some memory since no intermediate buffering is necessary. diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 61ccf7722fc3..e3d168911dbe 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -238,8 +238,7 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset, struct page *page = pages[i]; if (page) { - memcpy(data, kmap(page), PAGE_SIZE); - kunmap(page); + memcpy_from_page(data, page, 0, PAGE_SIZE); put_page(page); } else memset(data, 0, PAGE_SIZE); @@ -408,7 +407,7 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) * unpopulated ptes via cramfs_read_folio(). */ int i; - vma->vm_flags |= VM_MIXEDMAP; + vm_flags_set(vma, VM_MIXEDMAP); for (i = 0; i < pages && !ret; i++) { vm_fault_t vmf; unsigned long off = i * PAGE_SIZE; @@ -437,7 +436,7 @@ bailout: static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) { - return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS; + return is_nommu_shared_mapping(vma->vm_flags) ? 
0 : -ENOSYS; } static unsigned long cramfs_physmem_get_unmapped_area(struct file *file, @@ -815,7 +814,7 @@ static int cramfs_read_folio(struct file *file, struct folio *folio) maxblock = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; bytes_filled = 0; - pgdata = kmap(page); + pgdata = kmap_local_page(page); if (page->index < maxblock) { struct super_block *sb = inode->i_sb; @@ -903,13 +902,13 @@ static int cramfs_read_folio(struct file *file, struct folio *folio) memset(pgdata + bytes_filled, 0, PAGE_SIZE - bytes_filled); flush_dcache_page(page); - kunmap(page); + kunmap_local(pgdata); SetPageUptodate(page); unlock_page(page); return 0; err: - kunmap(page); + kunmap_local(pgdata); ClearPageUptodate(page); SetPageError(page); unlock_page(page); diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 1b4403136d05..d57d0a020f71 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -30,13 +30,11 @@ */ bool fscrypt_decrypt_bio(struct bio *bio) { - struct bio_vec *bv; - struct bvec_iter_all iter_all; + struct folio_iter fi; - bio_for_each_segment_all(bv, bio, iter_all) { - struct page *page = bv->bv_page; - int err = fscrypt_decrypt_pagecache_blocks(page, bv->bv_len, - bv->bv_offset); + bio_for_each_folio_all(fi, bio) { + int err = fscrypt_decrypt_pagecache_blocks(fi.folio, fi.length, + fi.offset); if (err) { bio->bi_status = errno_to_blk_status(err); diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index e78be66bbf01..bf642479269a 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -237,41 +237,43 @@ EXPORT_SYMBOL(fscrypt_encrypt_block_inplace); /** * fscrypt_decrypt_pagecache_blocks() - Decrypt filesystem blocks in a - * pagecache page - * @page: The locked pagecache page containing the block(s) to decrypt + * pagecache folio + * @folio: The locked pagecache folio containing the block(s) to decrypt * @len: Total size of the block(s) to decrypt. Must be a nonzero * multiple of the filesystem's block size. - * @offs: Byte offset within @page of the first block to decrypt. Must be + * @offs: Byte offset within @folio of the first block to decrypt. Must be * a multiple of the filesystem's block size. * - * The specified block(s) are decrypted in-place within the pagecache page, - * which must still be locked and not uptodate. Normally, blocksize == - * PAGE_SIZE and the whole page is decrypted at once. + * The specified block(s) are decrypted in-place within the pagecache folio, + * which must still be locked and not uptodate. * * This is for use by the filesystem's ->readahead() method. 
* * Return: 0 on success; -errno on failure */ -int fscrypt_decrypt_pagecache_blocks(struct page *page, unsigned int len, - unsigned int offs) +int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, + size_t offs) { - const struct inode *inode = page->mapping->host; + const struct inode *inode = folio->mapping->host; const unsigned int blockbits = inode->i_blkbits; const unsigned int blocksize = 1 << blockbits; - u64 lblk_num = ((u64)page->index << (PAGE_SHIFT - blockbits)) + + u64 lblk_num = ((u64)folio->index << (PAGE_SHIFT - blockbits)) + (offs >> blockbits); - unsigned int i; + size_t i; int err; - if (WARN_ON_ONCE(!PageLocked(page))) + if (WARN_ON_ONCE(!folio_test_locked(folio))) return -EINVAL; if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize))) return -EINVAL; for (i = offs; i < offs + len; i += blocksize, lblk_num++) { + struct page *page = folio_page(folio, i >> PAGE_SHIFT); + err = fscrypt_crypt_block(inode, FS_DECRYPT, lblk_num, page, - page, blocksize, i, GFP_NOFS); + page, blocksize, i & ~PAGE_MASK, + GFP_NOFS); if (err) return err; } diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 316a778cec0f..0fec2dfc36eb 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -573,6 +573,9 @@ fscrypt_find_master_key(struct super_block *sb, int fscrypt_get_test_dummy_key_identifier( u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); +int fscrypt_add_test_dummy_key(struct super_block *sb, + struct fscrypt_key_specifier *key_spec); + int fscrypt_verify_key_added(struct super_block *sb, const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); @@ -651,6 +654,7 @@ bool fscrypt_policies_equal(const union fscrypt_policy *policy1, const union fscrypt_policy *policy2); int fscrypt_policy_to_key_spec(const union fscrypt_policy *policy, struct fscrypt_key_specifier *key_spec); +const union fscrypt_policy *fscrypt_get_dummy_policy(struct super_block *sb); bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, const struct inode *inode); int fscrypt_policy_from_context(union fscrypt_policy *policy_u, diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 78dd2ff306bd..78086f8dbda5 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -211,10 +211,6 @@ static int allocate_filesystem_keyring(struct super_block *sb) * are still available at this time; this is important because after user file * accesses have been allowed, this function may need to evict keys from the * keyslots of an inline crypto engine, which requires the block device(s). - * - * This is also called when the super_block is being freed. This is needed to - * avoid a memory leak if mounting fails after the "test_dummy_encryption" - * option was processed, as in that case the unmount-time call isn't made. */ void fscrypt_destroy_keyring(struct super_block *sb) { @@ -778,34 +774,26 @@ out: /** * fscrypt_add_test_dummy_key() - add the test dummy encryption key * @sb: the filesystem instance to add the key to - * @dummy_policy: the encryption policy for test_dummy_encryption + * @key_spec: the key specifier of the test dummy encryption key * - * If needed, add the key for the test_dummy_encryption mount option to the - * filesystem. To prevent misuse of this mount option, a per-boot random key is - * used instead of a hardcoded one. This makes it so that any encrypted files - * created using this option won't be accessible after a reboot. + * Add the key for the test_dummy_encryption mount option to the filesystem. 
To + * prevent misuse of this mount option, a per-boot random key is used instead of + * a hardcoded one. This makes it so that any encrypted files created using + * this option won't be accessible after a reboot. * * Return: 0 on success, -errno on failure */ int fscrypt_add_test_dummy_key(struct super_block *sb, - const struct fscrypt_dummy_policy *dummy_policy) + struct fscrypt_key_specifier *key_spec) { - const union fscrypt_policy *policy = dummy_policy->policy; - struct fscrypt_key_specifier key_spec; struct fscrypt_master_key_secret secret; int err; - if (!policy) - return 0; - err = fscrypt_policy_to_key_spec(policy, &key_spec); - if (err) - return err; fscrypt_get_test_dummy_secret(&secret); - err = add_master_key(sb, &secret, &key_spec); + err = add_master_key(sb, &secret, key_spec); wipe_master_key_secret(&secret); return err; } -EXPORT_SYMBOL_GPL(fscrypt_add_test_dummy_key); /* * Verify that the current user has added a master key with the given identifier diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 94757ccd3056..aa94fba9d17e 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -438,6 +438,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, bool need_dirhash_key, struct fscrypt_master_key **mk_ret) { + struct super_block *sb = ci->ci_inode->i_sb; struct fscrypt_key_specifier mk_spec; struct fscrypt_master_key *mk; int err; @@ -450,8 +451,26 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, if (err) return err; - mk = fscrypt_find_master_key(ci->ci_inode->i_sb, &mk_spec); - if (!mk) { + mk = fscrypt_find_master_key(sb, &mk_spec); + if (unlikely(!mk)) { + const union fscrypt_policy *dummy_policy = + fscrypt_get_dummy_policy(sb); + + /* + * Add the test_dummy_encryption key on-demand. In principle, + * it should be added at mount time. Do it here instead so that + * the individual filesystems don't need to worry about adding + * this key at mount time and cleaning up on mount failure. 
+ */ + if (dummy_policy && + fscrypt_policies_equal(dummy_policy, &ci->ci_policy)) { + err = fscrypt_add_test_dummy_key(sb, &mk_spec); + if (err) + return err; + mk = fscrypt_find_master_key(sb, &mk_spec); + } + } + if (unlikely(!mk)) { if (ci->ci_policy.version != FSCRYPT_POLICY_V1) return -ENOKEY; diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 893661b52376..3b5fcb6402ea 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -53,8 +53,7 @@ int fscrypt_policy_to_key_spec(const union fscrypt_policy *policy, } } -static const union fscrypt_policy * -fscrypt_get_dummy_policy(struct super_block *sb) +const union fscrypt_policy *fscrypt_get_dummy_policy(struct super_block *sb) { if (!sb->s_cop->get_dummy_policy) return NULL; @@ -506,7 +505,7 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) return -EFAULT; policy.version = version; - if (!inode_owner_or_capable(&init_user_ns, inode)) + if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) return -EACCES; ret = mnt_want_write_file(filp); diff --git a/fs/dax.c b/fs/dax.c --- a/fs/dax.c +++ b/fs/dax.c @@ -1271,8 +1271,9 @@ static s64 dax_unshare_iter(struct iomap_iter *iter) if (ret < 0) goto out_unlock; - ret = copy_mc_to_kernel(daddr, saddr, length); - if (ret) + if (copy_mc_to_kernel(daddr, saddr, length) == 0) + ret = length; + else ret = -EIO; out_unlock: diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index b54f470e0d03..1f971c880dde 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -899,6 +899,7 @@ ssize_t debugfs_read_file_str(struct file *file, char __user *user_buf, return ret; } +EXPORT_SYMBOL_GPL(debugfs_create_str); static ssize_t debugfs_write_file_str(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 2e8e112b1993..3f81f73c241a 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -42,7 +42,7 @@ static unsigned int debugfs_allow __ro_after_init = DEFAULT_DEBUGFS_ALLOW_BITS; * so that we can use the file mode as part of a heuristic to determine whether * to lock down individual files. */ -static int debugfs_setattr(struct user_namespace *mnt_userns, +static int debugfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *ia) { int ret; @@ -52,7 +52,7 @@ static int debugfs_setattr(struct user_namespace *mnt_userns, if (ret) return ret; } - return simple_setattr(&init_user_ns, dentry, ia); + return simple_setattr(&nop_mnt_idmap, dentry, ia); } static const struct inode_operations debugfs_file_inode_operations = { @@ -802,8 +802,8 @@ EXPORT_SYMBOL_GPL(debugfs_lookup_and_remove); * exist for rename to succeed. * * This function will return a pointer to old_dentry (which is updated to - * reflect renaming) if it succeeds. If an error occurs, %NULL will be - * returned. + * reflect renaming) if it succeeds. If an error occurs, ERR_PTR(-ERROR) + * will be returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned.
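
[Editorial sketch, not part of the patch.] Given the doc-comment change just above, callers of debugfs_rename() should treat failures as error pointers rather than NULL. A short caller-side sketch under that assumption; the surrounding function and names are illustrative:

#include <linux/debugfs.h>
#include <linux/err.h>
#include <linux/printk.h>

/* Illustrative only: with the updated contract, failure comes back as
 * ERR_PTR(-ERROR) rather than NULL, so test with IS_ERR().
 */
static void my_debugfs_move(struct dentry *old_dir, struct dentry *old_dentry,
                            struct dentry *new_dir)
{
        struct dentry *d;

        d = debugfs_rename(old_dir, old_dentry, new_dir, "renamed");
        if (IS_ERR(d))
                pr_warn("debugfs rename failed: %pe\n", d);
}
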
@@ -837,7 +837,7 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, take_dentry_name_snapshot(&old_name, old_dentry); - error = simple_rename(&init_user_ns, d_inode(old_dir), old_dentry, + error = simple_rename(&nop_mnt_idmap, d_inode(old_dir), old_dentry, d_inode(new_dir), dentry, 0); if (error) { release_dentry_name_snapshot(&old_name); diff --git a/fs/direct-io.c b/fs/direct-io.c index 03d381377ae1..ab0d7ea89813 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -558,30 +558,6 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) return ret; } -/* - * Create workqueue for deferred direct IO completions. We allocate the - * workqueue when it's first needed. This avoids creating workqueue for - * filesystems that don't need it and also allows us to create the workqueue - * late enough so the we can include s_id in the name of the workqueue. - */ -int sb_init_dio_done_wq(struct super_block *sb) -{ - struct workqueue_struct *old; - struct workqueue_struct *wq = alloc_workqueue("dio/%s", - WQ_MEM_RECLAIM, 0, - sb->s_id); - if (!wq) - return -ENOMEM; - /* - * This has to be atomic as more DIOs can race to create the workqueue - */ - old = cmpxchg(&sb->s_dio_done_wq, NULL, wq); - /* Someone created workqueue before us? Free ours... */ - if (old) - destroy_workqueue(wq); - return 0; -} - static int dio_set_defer_completion(struct dio *dio) { struct super_block *sb = dio->inode->i_sb; diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index 1105ce3c80cb..b3b86dbdc187 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -4,7 +4,6 @@ menuconfig DLM depends on INET depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n) select IP_SCTP - select SRCU help A general purpose distributed lock manager for kernel or userspace applications. 
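
[Editorial sketch, not part of the patch.] Looking back at the fs/direct-io.c hunk above: the deleted sb_init_dio_done_wq() used a lock-free lazy-initialization idiom worth noting. The sketch below restates that idiom with hypothetical names; it is not the relocated kernel helper:

#include <linux/workqueue.h>

static struct workqueue_struct *my_lazy_wq;     /* hypothetical singleton */

/*
 * Allocate first, then publish with cmpxchg(): if several tasks race
 * here, exactly one allocation wins and the losers free their copy,
 * without holding a lock across the (sleeping) allocation.
 */
static int my_lazy_wq_init(void)
{
        struct workqueue_struct *wq, *old;

        wq = alloc_workqueue("my_lazy", WQ_MEM_RECLAIM, 0);
        if (!wq)
                return -ENOMEM;
        old = cmpxchg(&my_lazy_wq, NULL, wq);
        if (old)        /* somebody beat us to it; keep the winner's queue */
                destroy_workqueue(wq);
        return 0;
}
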
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index d0b4e2181a5f..7325acbd1af7 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -215,9 +215,9 @@ static int do_uevent(struct dlm_ls *ls, int in) return ls->ls_uevent_result; } -static int dlm_uevent(struct kobject *kobj, struct kobj_uevent_env *env) +static int dlm_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) { - struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj); + const struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj); add_uevent_var(env, "LOCKSPACE=%s", ls->ls_name); return 0; @@ -381,23 +381,23 @@ static int threads_start(void) { int error; - error = dlm_scand_start(); + /* Thread for sending/receiving messages for all lockspace's */ + error = dlm_midcomms_start(); if (error) { - log_print("cannot start dlm_scand thread %d", error); + log_print("cannot start dlm midcomms %d", error); goto fail; } - /* Thread for sending/receiving messages for all lockspace's */ - error = dlm_midcomms_start(); + error = dlm_scand_start(); if (error) { - log_print("cannot start dlm midcomms %d", error); - goto scand_fail; + log_print("cannot start dlm_scand thread %d", error); + goto midcomms_fail; } return 0; - scand_fail: - dlm_scand_stop(); + midcomms_fail: + dlm_midcomms_stop(); fail: return error; } @@ -572,7 +572,7 @@ static int new_lockspace(const char *name, const char *cluster, spin_lock_init(&ls->ls_rcom_spin); get_random_bytes(&ls->ls_rcom_seq, sizeof(uint64_t)); ls->ls_recover_status = 0; - ls->ls_recover_seq = 0; + ls->ls_recover_seq = get_random_u64(); ls->ls_recover_args = NULL; init_rwsem(&ls->ls_in_recovery); init_rwsem(&ls->ls_recv_active); @@ -820,6 +820,9 @@ static int release_lockspace(struct dlm_ls *ls, int force) return rv; } + if (ls_count == 1) + dlm_midcomms_version_wait(); + dlm_device_deregister(ls); if (force < 3 && dlm_user_daemon_available()) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 4450721ec83c..a9b14f81d655 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -54,6 +54,7 @@ #include <net/ipv6.h> #include <trace/events/dlm.h> +#include <trace/events/sock.h> #include "dlm_internal.h" #include "lowcomms.h" @@ -61,6 +62,7 @@ #include "memory.h" #include "config.h" +#define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(5000) #define NEEDED_RMEM (4*1024*1024) struct connection { @@ -99,6 +101,7 @@ struct connection { struct connection *othercon; struct work_struct rwork; /* receive worker */ struct work_struct swork; /* send worker */ + wait_queue_head_t shutdown_wait; unsigned char rx_leftover_buf[DLM_MAX_SOCKET_BUFSIZE]; int rx_leftover; int mark; @@ -282,6 +285,7 @@ static void dlm_con_init(struct connection *con, int nodeid) INIT_WORK(&con->swork, process_send_sockets); INIT_WORK(&con->rwork, process_recv_sockets); spin_lock_init(&con->addrs_lock); + init_waitqueue_head(&con->shutdown_wait); } /* @@ -499,6 +503,8 @@ static void lowcomms_data_ready(struct sock *sk) { struct connection *con = sock2con(sk); + trace_sk_data_ready(sk); + set_bit(CF_RECV_INTR, &con->flags); lowcomms_queue_rwork(con); } @@ -530,6 +536,8 @@ static void lowcomms_state_change(struct sock *sk) static void lowcomms_listen_data_ready(struct sock *sk) { + trace_sk_data_ready(sk); + queue_work(io_workqueue, &listen_con.rwork); } @@ -790,6 +798,43 @@ static void close_connection(struct connection *con, bool and_other) up_write(&con->sock_lock); } +static void shutdown_connection(struct connection *con, bool and_other) +{ + int ret; + + if (con->othercon && and_other) + 
shutdown_connection(con->othercon, false); + + flush_workqueue(io_workqueue); + down_read(&con->sock_lock); + /* nothing to shutdown */ + if (!con->sock) { + up_read(&con->sock_lock); + return; + } + + ret = kernel_sock_shutdown(con->sock, SHUT_WR); + up_read(&con->sock_lock); + if (ret) { + log_print("Connection %p failed to shutdown: %d will force close", + con, ret); + goto force_close; + } else { + ret = wait_event_timeout(con->shutdown_wait, !con->sock, + DLM_SHUTDOWN_WAIT_TIMEOUT); + if (ret == 0) { + log_print("Connection %p shutdown timed out, will force close", + con); + goto force_close; + } + } + + return; + +force_close: + close_connection(con, false); +} + static struct processqueue_entry *new_processqueue_entry(int nodeid, int buflen) { @@ -1488,6 +1533,7 @@ static void process_recv_sockets(struct work_struct *work) break; case DLM_IO_EOF: close_connection(con, false); + wake_up(&con->shutdown_wait); /* CF_RECV_PENDING cleared */ break; case DLM_IO_RESCHED: @@ -1695,6 +1741,9 @@ static int work_start(void) void dlm_lowcomms_shutdown(void) { + struct connection *con; + int i, idx; + /* stop lowcomms_listen_data_ready calls */ lock_sock(listen_con.sock->sk); listen_con.sock->sk->sk_data_ready = listen_sock.sk_data_ready; @@ -1703,29 +1752,20 @@ void dlm_lowcomms_shutdown(void) cancel_work_sync(&listen_con.rwork); dlm_close_sock(&listen_con.sock); - flush_workqueue(process_workqueue); -} - -void dlm_lowcomms_shutdown_node(int nodeid, bool force) -{ - struct connection *con; - int idx; - idx = srcu_read_lock(&connections_srcu); - con = nodeid2con(nodeid, 0); - if (WARN_ON_ONCE(!con)) { - srcu_read_unlock(&connections_srcu, idx); - return; - } + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(con, &connection_hash[i], list) { + shutdown_connection(con, true); + stop_connection_io(con); + flush_workqueue(process_workqueue); + close_connection(con, true); - flush_work(&con->swork); - stop_connection_io(con); - WARN_ON_ONCE(!force && !list_empty(&con->writequeue)); - close_connection(con, true); - clean_one_writequeue(con); - if (con->othercon) - clean_one_writequeue(con->othercon); - allow_connection_io(con); + clean_one_writequeue(con); + if (con->othercon) + clean_one_writequeue(con->othercon); + allow_connection_io(con); + } + } srcu_read_unlock(&connections_srcu, idx); } diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index eb7a08641fcf..cdbaa452fc05 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -51,7 +51,7 @@ int __init dlm_memory_init(void) cb_cache = kmem_cache_create("dlm_cb", sizeof(struct dlm_callback), __alignof__(struct dlm_callback), 0, NULL); - if (!rsb_cache) + if (!cb_cache) goto cb; return 0; diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index fc015a6abe17..c02c43e4980a 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -146,8 +146,8 @@ /* init value for sequence numbers for testing purpose only e.g. 
overflows */ #define DLM_SEQ_INIT 0 -/* 3 minutes wait to sync ending of dlm */ -#define DLM_SHUTDOWN_TIMEOUT msecs_to_jiffies(3 * 60 * 1000) +/* 5 seconds wait to sync ending of dlm */ +#define DLM_SHUTDOWN_TIMEOUT msecs_to_jiffies(5000) #define DLM_VERSION_NOT_SET 0 struct midcomms_node { @@ -375,7 +375,7 @@ static int dlm_send_ack(int nodeid, uint32_t seq) struct dlm_msg *msg; char *ppc; - msg = dlm_lowcomms_new_msg(nodeid, mb_len, GFP_NOFS, &ppc, + msg = dlm_lowcomms_new_msg(nodeid, mb_len, GFP_ATOMIC, &ppc, NULL, NULL); if (!msg) return -ENOMEM; @@ -402,10 +402,11 @@ static int dlm_send_fin(struct midcomms_node *node, struct dlm_mhandle *mh; char *ppc; - mh = dlm_midcomms_get_mhandle(node->nodeid, mb_len, GFP_NOFS, &ppc); + mh = dlm_midcomms_get_mhandle(node->nodeid, mb_len, GFP_ATOMIC, &ppc); if (!mh) return -ENOMEM; + set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags); mh->ack_rcv = ack_rcv; m_header = (struct dlm_header *)ppc; @@ -417,7 +418,6 @@ static int dlm_send_fin(struct midcomms_node *node, pr_debug("sending fin msg to node %d\n", node->nodeid); dlm_midcomms_commit_mhandle(mh, NULL, 0); - set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags); return 0; } @@ -467,7 +467,7 @@ static void dlm_pas_fin_ack_rcv(struct midcomms_node *node) break; default: spin_unlock(&node->state_lock); - log_print("%s: unexpected state: %d\n", + log_print("%s: unexpected state: %d", __func__, node->state); WARN_ON_ONCE(1); return; @@ -498,18 +498,14 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, switch (p->header.h_cmd) { case DLM_FIN: - /* send ack before fin */ - dlm_send_ack(node->nodeid, node->seq_next); - spin_lock(&node->state_lock); pr_debug("receive fin msg from node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); switch (node->state) { case DLM_ESTABLISHED: - node->state = DLM_CLOSE_WAIT; - pr_debug("switch node %d to state %s\n", - node->nodeid, dlm_state_str(node->state)); + dlm_send_ack(node->nodeid, node->seq_next); + /* passive shutdown DLM_LAST_ACK case 1 * additional we check if the node is used by * cluster manager events at all. 
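
[Editorial sketch, not part of the patch.] One detail in the dlm_send_fin() hunk above deserves a note: DLM_NODE_FLAG_STOP_TX is now set before the message is committed rather than after. A minimal ordering sketch with illustrative types, not the real DLM structures:

#include <linux/bitops.h>

#define MY_FLAG_STOP_TX 0

struct my_node {
        unsigned long flags;            /* illustrative stand-in */
};

/*
 * Committing publishes the FIN, after which the peer's ack can be
 * processed at any moment; any state the ack path must observe has to
 * be in place before that point, hence flag first, commit second.
 */
static void my_send_fin(struct my_node *node, void (*commit_fin)(void))
{
        set_bit(MY_FLAG_STOP_TX, &node->flags); /* visible before publish */
        commit_fin();                           /* ack may race from here on */
}
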
@@ -518,34 +514,38 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, node->state = DLM_LAST_ACK; pr_debug("switch node %d to state %s case 1\n", node->nodeid, dlm_state_str(node->state)); - spin_unlock(&node->state_lock); - goto send_fin; + set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); + dlm_send_fin(node, dlm_pas_fin_ack_rcv); + } else { + node->state = DLM_CLOSE_WAIT; + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); } break; case DLM_FIN_WAIT1: + dlm_send_ack(node->nodeid, node->seq_next); node->state = DLM_CLOSING; + set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); pr_debug("switch node %d to state %s\n", node->nodeid, dlm_state_str(node->state)); break; case DLM_FIN_WAIT2: + dlm_send_ack(node->nodeid, node->seq_next); midcomms_node_reset(node); pr_debug("switch node %d to state %s\n", node->nodeid, dlm_state_str(node->state)); - wake_up(&node->shutdown_wait); break; case DLM_LAST_ACK: /* probably remove_member caught it, do nothing */ break; default: spin_unlock(&node->state_lock); - log_print("%s: unexpected state: %d\n", + log_print("%s: unexpected state: %d", __func__, node->state); WARN_ON_ONCE(1); return; } spin_unlock(&node->state_lock); - - set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); break; default: WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); @@ -564,12 +564,6 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, log_print_ratelimited("ignore dlm msg because seq mismatch, seq: %u, expected: %u, nodeid: %d", seq, node->seq_next, node->nodeid); } - - return; - -send_fin: - set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); - dlm_send_fin(node, dlm_pas_fin_ack_rcv); } static struct midcomms_node * @@ -612,16 +606,8 @@ dlm_midcomms_recv_node_lookup(int nodeid, const union dlm_packet *p, case DLM_ESTABLISHED: break; default: - /* some invalid state passive shutdown - * was failed, we try to reset and - * hope it will go on. - */ - log_print("reset node %d because shutdown stuck", - node->nodeid); - - midcomms_node_reset(node); - node->state = DLM_ESTABLISHED; - break; + spin_unlock(&node->state_lock); + return NULL; } spin_unlock(&node->state_lock); } @@ -671,6 +657,7 @@ static int dlm_midcomms_version_check_3_2(struct midcomms_node *node) switch (node->version) { case DLM_VERSION_NOT_SET: node->version = DLM_VERSION_3_2; + wake_up(&node->shutdown_wait); log_print("version 0x%08x for node %d detected", DLM_VERSION_3_2, node->nodeid); break; @@ -840,6 +827,7 @@ static int dlm_midcomms_version_check_3_1(struct midcomms_node *node) switch (node->version) { case DLM_VERSION_NOT_SET: node->version = DLM_VERSION_3_1; + wake_up(&node->shutdown_wait); log_print("version 0x%08x for node %d detected", DLM_VERSION_3_1, node->nodeid); break; @@ -1214,8 +1202,15 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, dlm_free_mhandle(mh); break; case DLM_VERSION_3_2: + /* hold the rcu read lock here because we are sending the + * dlm message out; while doing so we could receive + * an ack back which releases the mhandle and we + * would get a use-after-free.
+ */ + rcu_read_lock(); dlm_midcomms_commit_msg_3_2(mh, name, namelen); srcu_read_unlock(&nodes_srcu, mh->idx); + rcu_read_unlock(); break; default: srcu_read_unlock(&nodes_srcu, mh->idx); @@ -1266,7 +1261,6 @@ static void dlm_act_fin_ack_rcv(struct midcomms_node *node) midcomms_node_reset(node); pr_debug("switch node %d to state %s\n", node->nodeid, dlm_state_str(node->state)); - wake_up(&node->shutdown_wait); break; case DLM_CLOSED: /* not valid but somehow we got what we want */ @@ -1274,7 +1268,7 @@ static void dlm_act_fin_ack_rcv(struct midcomms_node *node) break; default: spin_unlock(&node->state_lock); - log_print("%s: unexpected state: %d\n", + log_print("%s: unexpected state: %d", __func__, node->state); WARN_ON_ONCE(1); return; @@ -1362,11 +1356,11 @@ void dlm_midcomms_remove_member(int nodeid) case DLM_CLOSE_WAIT: /* passive shutdown DLM_LAST_ACK case 2 */ node->state = DLM_LAST_ACK; - spin_unlock(&node->state_lock); - pr_debug("switch node %d to state %s case 2\n", node->nodeid, dlm_state_str(node->state)); - goto send_fin; + set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); + dlm_send_fin(node, dlm_pas_fin_ack_rcv); + break; case DLM_LAST_ACK: /* probably receive fin caught it, do nothing */ break; @@ -1374,7 +1368,7 @@ void dlm_midcomms_remove_member(int nodeid) /* already gone, do nothing */ break; default: - log_print("%s: unexpected state: %d\n", + log_print("%s: unexpected state: %d", __func__, node->state); break; } @@ -1382,12 +1376,6 @@ void dlm_midcomms_remove_member(int nodeid) spin_unlock(&node->state_lock); srcu_read_unlock(&nodes_srcu, idx); - return; - -send_fin: - set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); - dlm_send_fin(node, dlm_pas_fin_ack_rcv); - srcu_read_unlock(&nodes_srcu, idx); } static void midcomms_node_release(struct rcu_head *rcu) @@ -1395,9 +1383,31 @@ static void midcomms_node_release(struct rcu_head *rcu) struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu); WARN_ON_ONCE(atomic_read(&node->send_queue_cnt)); + dlm_send_queue_flush(node); kfree(node); } +void dlm_midcomms_version_wait(void) +{ + struct midcomms_node *node; + int i, idx, ret; + + idx = srcu_read_lock(&nodes_srcu); + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(node, &node_hash[i], hlist) { + ret = wait_event_timeout(node->shutdown_wait, + node->version != DLM_VERSION_NOT_SET || + node->state == DLM_CLOSED || + test_bit(DLM_NODE_FLAG_CLOSE, &node->flags), + DLM_SHUTDOWN_TIMEOUT); + if (!ret || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags)) + pr_debug("version wait timed out for node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + } + } + srcu_read_unlock(&nodes_srcu, idx); +} + static void midcomms_shutdown(struct midcomms_node *node) { int ret; @@ -1418,11 +1428,11 @@ static void midcomms_shutdown(struct midcomms_node *node) node->state = DLM_FIN_WAIT1; pr_debug("switch node %d to state %s case 2\n", node->nodeid, dlm_state_str(node->state)); + dlm_send_fin(node, dlm_act_fin_ack_rcv); break; case DLM_CLOSED: /* we have what we want */ - spin_unlock(&node->state_lock); - return; + break; default: /* busy to enter DLM_FIN_WAIT1, wait until passive * done in shutdown_wait to enter DLM_CLOSED. 
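dlm_midcomms_commit_mhandle() above now holds the RCU read lock across the 3.2 send path: an ack processed concurrently may drop the last reference to the mhandle, so the send must sit inside a read-side critical section and the actual free must be deferred past the grace period. The idiom in isolation, with hypothetical names:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct mhandle_sketch {
	struct rcu_head rcu;
	/* message state ... */
};

static void commit_sketch(struct mhandle_sketch *mh)
{
	rcu_read_lock();	/* pin mh across the send */
	/* transmit here; a concurrent ack may drop the last
	 * reference, but the free below is deferred past us */
	rcu_read_unlock();
}

static void release_sketch(struct mhandle_sketch *mh)
{
	/* waits for readers such as commit_sketch() before freeing */
	kfree_rcu(mh, rcu);
}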
@@ -1431,29 +1441,20 @@ static void midcomms_shutdown(struct midcomms_node *node) } spin_unlock(&node->state_lock); - if (node->state == DLM_FIN_WAIT1) { - dlm_send_fin(node, dlm_act_fin_ack_rcv); - - if (DLM_DEBUG_FENCE_TERMINATION) - msleep(5000); - } + if (DLM_DEBUG_FENCE_TERMINATION) + msleep(5000); /* wait for other side dlm + fin */ ret = wait_event_timeout(node->shutdown_wait, node->state == DLM_CLOSED || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags), DLM_SHUTDOWN_TIMEOUT); - if (!ret || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags)) { + if (!ret || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags)) pr_debug("active shutdown timed out for node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); - midcomms_node_reset(node); - dlm_lowcomms_shutdown_node(node->nodeid, true); - return; - } - - pr_debug("active shutdown done for node %d with state %s\n", - node->nodeid, dlm_state_str(node->state)); - dlm_lowcomms_shutdown_node(node->nodeid, false); + else + pr_debug("active shutdown done for node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); } void dlm_midcomms_shutdown(void) @@ -1461,8 +1462,6 @@ void dlm_midcomms_shutdown(void) struct midcomms_node *node; int i, idx; - dlm_lowcomms_shutdown(); - mutex_lock(&close_lock); idx = srcu_read_lock(&nodes_srcu); for (i = 0; i < CONN_HASH_SIZE; i++) { @@ -1480,6 +1479,8 @@ void dlm_midcomms_shutdown(void) } srcu_read_unlock(&nodes_srcu, idx); mutex_unlock(&close_lock); + + dlm_lowcomms_shutdown(); } int dlm_midcomms_close(int nodeid) diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h index bea1cee4279c..9f8c9605013d 100644 --- a/fs/dlm/midcomms.h +++ b/fs/dlm/midcomms.h @@ -20,6 +20,7 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, gfp_t allocation, char **ppc); void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, const void *name, int namelen); +void dlm_midcomms_version_wait(void); int dlm_midcomms_close(int nodeid); int dlm_midcomms_start(void); void dlm_midcomms_stop(void); diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index 737f185aad8d..ed4357e62f35 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -4,6 +4,7 @@ */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/miscdevice.h> #include <linux/poll.h> #include <linux/dlm.h> diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index e3f5d7f3c8a0..c16f0d660cb7 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -260,22 +260,6 @@ int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, return i; } -struct extent_crypt_result { - struct completion completion; - int rc; -}; - -static void extent_crypt_complete(struct crypto_async_request *req, int rc) -{ - struct extent_crypt_result *ecr = req->data; - - if (rc == -EINPROGRESS) - return; - - ecr->rc = rc; - complete(&ecr->completion); -} - /** * crypt_scatterlist * @crypt_stat: Pointer to the crypt_stat struct to initialize. 
@@ -293,7 +277,7 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, unsigned char *iv, int op) { struct skcipher_request *req = NULL; - struct extent_crypt_result ecr; + DECLARE_CRYPTO_WAIT(ecr); int rc = 0; if (unlikely(ecryptfs_verbosity > 0)) { @@ -303,8 +287,6 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, crypt_stat->key_size); } - init_completion(&ecr.completion); - mutex_lock(&crypt_stat->cs_tfm_mutex); req = skcipher_request_alloc(crypt_stat->tfm, GFP_NOFS); if (!req) { @@ -315,7 +297,7 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - extent_crypt_complete, &ecr); + crypto_req_done, &ecr); /* Consider doing this once, when the file is opened */ if (!(crypt_stat->flags & ECRYPTFS_KEY_SET)) { rc = crypto_skcipher_setkey(crypt_stat->tfm, crypt_stat->key, @@ -334,13 +316,7 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, skcipher_request_set_crypt(req, src_sg, dst_sg, size, iv); rc = op == ENCRYPT ? crypto_skcipher_encrypt(req) : crypto_skcipher_decrypt(req); - if (rc == -EINPROGRESS || rc == -EBUSY) { - struct extent_crypt_result *ecr = req->base.data; - - wait_for_completion(&ecr->completion); - rc = ecr->rc; - reinit_completion(&ecr->completion); - } + rc = crypto_wait_req(rc, &ecr); out: skcipher_request_free(req); return rc; @@ -1105,7 +1081,7 @@ ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry, } inode_lock(lower_inode); - rc = __vfs_setxattr(&init_user_ns, lower_dentry, lower_inode, + rc = __vfs_setxattr(&nop_mnt_idmap, lower_dentry, lower_inode, ECRYPTFS_XATTR_NAME, page_virt, size, 0); if (!rc && ecryptfs_inode) fsstack_copy_attr_all(ecryptfs_inode, lower_inode); diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index f3cd00fac9c3..144ace9e0dd9 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -139,7 +139,7 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry, if (d_unhashed(lower_dentry)) rc = -EINVAL; else - rc = vfs_unlink(&init_user_ns, lower_dir, lower_dentry, + rc = vfs_unlink(&nop_mnt_idmap, lower_dir, lower_dentry, NULL); } if (rc) { @@ -180,7 +180,7 @@ ecryptfs_do_create(struct inode *directory_inode, rc = lock_parent(ecryptfs_dentry, &lower_dentry, &lower_dir); if (!rc) - rc = vfs_create(&init_user_ns, lower_dir, + rc = vfs_create(&nop_mnt_idmap, lower_dir, lower_dentry, mode, true); if (rc) { printk(KERN_ERR "%s: Failure to create dentry in lower fs; " @@ -191,7 +191,7 @@ ecryptfs_do_create(struct inode *directory_inode, inode = __ecryptfs_get_inode(d_inode(lower_dentry), directory_inode->i_sb); if (IS_ERR(inode)) { - vfs_unlink(&init_user_ns, lower_dir, lower_dentry, NULL); + vfs_unlink(&nop_mnt_idmap, lower_dir, lower_dentry, NULL); goto out_lock; } fsstack_copy_attr_times(directory_inode, lower_dir); @@ -253,7 +253,7 @@ out: * Returns zero on success; non-zero on error condition */ static int -ecryptfs_create(struct user_namespace *mnt_userns, +ecryptfs_create(struct mnt_idmap *idmap, struct inode *directory_inode, struct dentry *ecryptfs_dentry, umode_t mode, bool excl) { @@ -434,7 +434,7 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); rc = lock_parent(new_dentry, &lower_new_dentry, &lower_dir); if (!rc) - rc = vfs_link(lower_old_dentry, &init_user_ns, lower_dir, + rc = vfs_link(lower_old_dentry, &nop_mnt_idmap, lower_dir, lower_new_dentry, NULL); 
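The ecryptfs conversion above swaps the hand-rolled extent_crypt_result completion for the stock crypto_req_done()/crypto_wait_req() helpers, which hide the -EINPROGRESS/-EBUSY handling. A self-contained sketch of the resulting idiom (encrypt_one() is a hypothetical wrapper):

#include <crypto/skcipher.h>
#include <linux/crypto.h>
#include <linux/scatterlist.h>

static int encrypt_one(struct crypto_skcipher *tfm, struct scatterlist *src,
		       struct scatterlist *dst, unsigned int len, u8 *iv)
{
	DECLARE_CRYPTO_WAIT(wait);	/* completion + return code in one */
	struct skcipher_request *req;
	int rc;

	req = skcipher_request_alloc(tfm, GFP_NOFS);
	if (!req)
		return -ENOMEM;
	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG |
				      CRYPTO_TFM_REQ_MAY_SLEEP,
				      crypto_req_done, &wait);
	skcipher_request_set_crypt(req, src, dst, len, iv);
	/* sleeps on the embedded completion if the cipher went async */
	rc = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
	skcipher_request_free(req);
	return rc;
}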
if (rc || d_really_is_negative(lower_new_dentry)) goto out_lock; @@ -456,7 +456,7 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry) return ecryptfs_do_unlink(dir, dentry, d_inode(dentry)); } -static int ecryptfs_symlink(struct user_namespace *mnt_userns, +static int ecryptfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { @@ -478,7 +478,7 @@ static int ecryptfs_symlink(struct user_namespace *mnt_userns, strlen(symname)); if (rc) goto out_lock; - rc = vfs_symlink(&init_user_ns, lower_dir, lower_dentry, + rc = vfs_symlink(&nop_mnt_idmap, lower_dir, lower_dentry, encoded_symname); kfree(encoded_symname); if (rc || d_really_is_negative(lower_dentry)) @@ -495,7 +495,7 @@ out_lock: return rc; } -static int ecryptfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int rc; @@ -504,7 +504,7 @@ static int ecryptfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, rc = lock_parent(dentry, &lower_dentry, &lower_dir); if (!rc) - rc = vfs_mkdir(&init_user_ns, lower_dir, + rc = vfs_mkdir(&nop_mnt_idmap, lower_dir, lower_dentry, mode); if (rc || d_really_is_negative(lower_dentry)) goto out; @@ -533,7 +533,7 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) if (d_unhashed(lower_dentry)) rc = -EINVAL; else - rc = vfs_rmdir(&init_user_ns, lower_dir, lower_dentry); + rc = vfs_rmdir(&nop_mnt_idmap, lower_dir, lower_dentry); } if (!rc) { clear_nlink(d_inode(dentry)); @@ -548,7 +548,7 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) } static int -ecryptfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +ecryptfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { int rc; @@ -557,7 +557,7 @@ ecryptfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, rc = lock_parent(dentry, &lower_dentry, &lower_dir); if (!rc) - rc = vfs_mknod(&init_user_ns, lower_dir, + rc = vfs_mknod(&nop_mnt_idmap, lower_dir, lower_dentry, mode, dev); if (rc || d_really_is_negative(lower_dentry)) goto out; @@ -574,7 +574,7 @@ out: } static int -ecryptfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -616,10 +616,10 @@ ecryptfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, goto out_lock; } - rd.old_mnt_userns = &init_user_ns; + rd.old_mnt_idmap = &nop_mnt_idmap; rd.old_dir = d_inode(lower_old_dir_dentry); rd.old_dentry = lower_old_dentry; - rd.new_mnt_userns = &init_user_ns; + rd.new_mnt_idmap = &nop_mnt_idmap; rd.new_dir = d_inode(lower_new_dir_dentry); rd.new_dentry = lower_new_dentry; rc = vfs_rename(&rd); @@ -856,7 +856,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); inode_lock(d_inode(lower_dentry)); - rc = notify_change(&init_user_ns, lower_dentry, + rc = notify_change(&nop_mnt_idmap, lower_dentry, &lower_ia, NULL); inode_unlock(d_inode(lower_dentry)); } @@ -864,16 +864,16 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) } static int -ecryptfs_permission(struct user_namespace *mnt_userns, struct inode *inode, +ecryptfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { - return inode_permission(&init_user_ns, 
+ return inode_permission(&nop_mnt_idmap, ecryptfs_inode_to_lower(inode), mask); } /** * ecryptfs_setattr - * @mnt_userns: user namespace of the target mount + * @idmap: idmap of the target mount * @dentry: dentry handle to the inode to modify * @ia: Structure with flags of what to change and values * @@ -884,7 +884,7 @@ ecryptfs_permission(struct user_namespace *mnt_userns, struct inode *inode, * All other metadata changes will be passed right to the lower filesystem, * and we will just update our inode to look like the lower. */ -static int ecryptfs_setattr(struct user_namespace *mnt_userns, +static int ecryptfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *ia) { int rc = 0; @@ -939,7 +939,7 @@ static int ecryptfs_setattr(struct user_namespace *mnt_userns, } mutex_unlock(&crypt_stat->cs_mutex); - rc = setattr_prepare(&init_user_ns, dentry, ia); + rc = setattr_prepare(&nop_mnt_idmap, dentry, ia); if (rc) goto out; if (ia->ia_valid & ATTR_SIZE) { @@ -965,14 +965,14 @@ static int ecryptfs_setattr(struct user_namespace *mnt_userns, lower_ia.ia_valid &= ~ATTR_MODE; inode_lock(d_inode(lower_dentry)); - rc = notify_change(&init_user_ns, lower_dentry, &lower_ia, NULL); + rc = notify_change(&nop_mnt_idmap, lower_dentry, &lower_ia, NULL); inode_unlock(d_inode(lower_dentry)); out: fsstack_copy_attr_all(inode, lower_inode); return rc; } -static int ecryptfs_getattr_link(struct user_namespace *mnt_userns, +static int ecryptfs_getattr_link(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { @@ -982,7 +982,7 @@ static int ecryptfs_getattr_link(struct user_namespace *mnt_userns, mount_crypt_stat = &ecryptfs_superblock_to_private( dentry->d_sb)->mount_crypt_stat; - generic_fillattr(&init_user_ns, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) { char *target; size_t targetsiz; @@ -998,7 +998,7 @@ static int ecryptfs_getattr_link(struct user_namespace *mnt_userns, return rc; } -static int ecryptfs_getattr(struct user_namespace *mnt_userns, +static int ecryptfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { @@ -1011,7 +1011,7 @@ static int ecryptfs_getattr(struct user_namespace *mnt_userns, if (!rc) { fsstack_copy_attr_all(d_inode(dentry), ecryptfs_inode_to_lower(d_inode(dentry))); - generic_fillattr(&init_user_ns, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); stat->blocks = lower_stat.blocks; } return rc; @@ -1033,7 +1033,7 @@ ecryptfs_setxattr(struct dentry *dentry, struct inode *inode, goto out; } inode_lock(lower_inode); - rc = __vfs_setxattr_locked(&init_user_ns, lower_dentry, name, value, size, flags, NULL); + rc = __vfs_setxattr_locked(&nop_mnt_idmap, lower_dentry, name, value, size, flags, NULL); inode_unlock(lower_inode); if (!rc && inode) fsstack_copy_attr_all(inode, lower_inode); @@ -1099,7 +1099,7 @@ static int ecryptfs_removexattr(struct dentry *dentry, struct inode *inode, goto out; } inode_lock(lower_inode); - rc = __vfs_removexattr(&init_user_ns, lower_dentry, name); + rc = __vfs_removexattr(&nop_mnt_idmap, lower_dentry, name); inode_unlock(lower_inode); out: return rc; @@ -1110,26 +1110,26 @@ static int ecryptfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) return vfs_fileattr_get(ecryptfs_dentry_to_lower(dentry), fa); } -static int ecryptfs_fileattr_set(struct user_namespace *mnt_userns, 
+static int ecryptfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); int rc; - rc = vfs_fileattr_set(&init_user_ns, lower_dentry, fa); + rc = vfs_fileattr_set(&nop_mnt_idmap, lower_dentry, fa); fsstack_copy_attr_all(d_inode(dentry), d_inode(lower_dentry)); return rc; } -static struct posix_acl *ecryptfs_get_acl(struct user_namespace *mnt_userns, +static struct posix_acl *ecryptfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type) { - return vfs_get_acl(mnt_userns, ecryptfs_dentry_to_lower(dentry), + return vfs_get_acl(idmap, ecryptfs_dentry_to_lower(dentry), posix_acl_xattr_name(type)); } -static int ecryptfs_set_acl(struct user_namespace *mnt_userns, +static int ecryptfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { @@ -1137,7 +1137,7 @@ static int ecryptfs_set_acl(struct user_namespace *mnt_userns, struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); struct inode *lower_inode = d_inode(lower_dentry); - rc = vfs_set_acl(&init_user_ns, lower_dentry, + rc = vfs_set_acl(&nop_mnt_idmap, lower_dentry, posix_acl_xattr_name(type), acl); if (!rc) fsstack_copy_attr_all(d_inode(dentry), lower_inode); @@ -1190,7 +1190,7 @@ static int ecryptfs_xattr_get(const struct xattr_handler *handler, } static int ecryptfs_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 19af229eb7ca..373c3e5747e6 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -428,7 +428,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) if (size < 0) size = 8; put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt); - rc = __vfs_setxattr(&init_user_ns, lower_dentry, lower_inode, + rc = __vfs_setxattr(&nop_mnt_idmap, lower_dentry, lower_inode, ECRYPTFS_XATTR_NAME, xattr_virt, size, 0); inode_unlock(lower_inode); if (rc) diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 617f3ad2485e..b973a2c03dde 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -70,7 +70,7 @@ bool efivarfs_valid_name(const char *str, int len) return uuid_is_valid(s); } -static int efivarfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int efivarfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode = NULL; @@ -163,7 +163,7 @@ efivarfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) } static int -efivarfs_fileattr_set(struct user_namespace *mnt_userns, +efivarfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { unsigned int i_flags = 0; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 07e82e246666..482d612b716b 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -194,6 +194,9 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc) struct dentry *root; int err; + if (!efivar_is_available()) + return -EOPNOTSUPP; + sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; @@ -243,6 +246,9 @@ static void efivarfs_kill_sb(struct super_block *sb) { kill_litter_super(sb); + if (!efivar_is_available()) + return; + /* Remove all entries and destroy */ efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL); 
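Nearly all of the ecryptfs and efivarfs churn above follows one mechanical rule: inode operations now take struct mnt_idmap * instead of struct user_namespace *, and stacking calls into the lower filesystem pass &nop_mnt_idmap (the identity mapping) where &init_user_ns used to go, since the upper layer never applies a mount idmapping of its own. The converted shape, as a sketch (sketch_getattr() is hypothetical):

#include <linux/fs.h>
#include <linux/mnt_idmap.h>
#include <linux/stat.h>

static int sketch_getattr(struct mnt_idmap *idmap, const struct path *path,
			  struct kstat *stat, u32 request_mask,
			  unsigned int flags)
{
	/* attributes are filled through the identity mapping; the
	 * caller-supplied idmap would only matter on idmapped mounts */
	generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat);
	return 0;
}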
} @@ -256,9 +262,6 @@ static struct file_system_type efivarfs_type = { static __init int efivarfs_init(void) { - if (!efivars_kobject()) - return -ENODEV; - return register_filesystem(&efivarfs_type); } diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 85490370e0ca..704fb59577e0 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -108,3 +108,21 @@ config EROFS_FS_ONDEMAND read support. If unsure, say N. + +config EROFS_FS_PCPU_KTHREAD + bool "EROFS per-cpu decompression kthread workers" + depends on EROFS_FS_ZIP + help + Saying Y here enables per-CPU kthread workers pool to carry out + async decompression for low latencies on some architectures. + + If unsure, say N. + +config EROFS_FS_PCPU_KTHREAD_HIPRI + bool "EROFS high priority per-CPU kthread workers" + depends on EROFS_FS_ZIP && EROFS_FS_PCPU_KTHREAD + help + This permits EROFS to configure per-CPU kthread workers to run + at higher priority. + + If unsure, say N. diff --git a/fs/erofs/data.c b/fs/erofs/data.c index f57f921683d7..e16545849ea7 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -74,8 +74,7 @@ void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, } static int erofs_map_blocks_flatmode(struct inode *inode, - struct erofs_map_blocks *map, - int flags) + struct erofs_map_blocks *map) { erofs_blk_t nblocks, lastblk; u64 offset = map->m_la; @@ -91,11 +90,8 @@ static int erofs_map_blocks_flatmode(struct inode *inode, map->m_pa = blknr_to_addr(vi->raw_blkaddr) + map->m_la; map->m_plen = blknr_to_addr(lastblk) - offset; } else if (tailendpacking) { - /* 2 - inode inline B: inode, [xattrs], inline last blk... */ - struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); - - map->m_pa = iloc(sbi, vi->nid) + vi->inode_isize + - vi->xattr_isize + erofs_blkoff(map->m_la); + map->m_pa = erofs_iloc(inode) + vi->inode_isize + + vi->xattr_isize + erofs_blkoff(offset); map->m_plen = inode->i_size - offset; /* inline data should be located in the same meta block */ @@ -117,8 +113,7 @@ static int erofs_map_blocks_flatmode(struct inode *inode, return 0; } -int erofs_map_blocks(struct inode *inode, - struct erofs_map_blocks *map, int flags) +int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) { struct super_block *sb = inode->i_sb; struct erofs_inode *vi = EROFS_I(inode); @@ -130,7 +125,7 @@ int erofs_map_blocks(struct inode *inode, void *kaddr; int err = 0; - trace_erofs_map_blocks_enter(inode, map, flags); + trace_erofs_map_blocks_enter(inode, map, 0); map->m_deviceid = 0; if (map->m_la >= inode->i_size) { /* leave out-of-bound access unmapped */ @@ -140,7 +135,7 @@ int erofs_map_blocks(struct inode *inode, } if (vi->datalayout != EROFS_INODE_CHUNK_BASED) { - err = erofs_map_blocks_flatmode(inode, map, flags); + err = erofs_map_blocks_flatmode(inode, map); goto out; } @@ -150,7 +145,7 @@ int erofs_map_blocks(struct inode *inode, unit = EROFS_BLOCK_MAP_ENTRY_SIZE; /* block map */ chunknr = map->m_la >> vi->chunkbits; - pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + + pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, unit) + unit * chunknr; kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP); @@ -192,7 +187,7 @@ out_unlock: out: if (!err) map->m_llen = map->m_plen; - trace_erofs_map_blocks_exit(inode, map, flags, 0); + trace_erofs_map_blocks_exit(inode, map, 0, err); return err; } @@ -255,7 +250,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, map.m_la = offset; map.m_llen = length; - ret = erofs_map_blocks(inode, 
&map, EROFS_GET_BLOCKS_RAW); + ret = erofs_map_blocks(inode, &map); if (ret < 0) return ret; @@ -429,7 +424,7 @@ static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; vma->vm_ops = &erofs_dax_vm_ops; - vma->vm_flags |= VM_HUGEPAGE; + vm_flags_set(vma, VM_HUGEPAGE); return 0; } #else diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index ecf28f66b97d..6970b09b8307 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -6,21 +6,6 @@ */ #include "internal.h" -static void debug_one_dentry(unsigned char d_type, const char *de_name, - unsigned int de_namelen) -{ -#ifdef CONFIG_EROFS_FS_DEBUG - /* since the on-disk name could not have the trailing '\0' */ - unsigned char dbg_namebuf[EROFS_NAME_LEN + 1]; - - memcpy(dbg_namebuf, de_name, de_namelen); - dbg_namebuf[de_namelen] = '\0'; - - erofs_dbg("found dirent %s de_len %u d_type %d", dbg_namebuf, - de_namelen, d_type); -#endif -} - static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, void *dentry_blk, struct erofs_dirent *de, unsigned int nameoff, unsigned int maxsize) @@ -52,10 +37,8 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, return -EFSCORRUPTED; } - debug_one_dentry(d_type, de_name, de_namelen); if (!dir_emit(ctx, de_name, de_namelen, le64_to_cpu(de->nid), d_type)) - /* stopped by some reason */ return 1; ++de; ctx->pos += sizeof(struct erofs_dirent); diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 014e20962376..96a87c023128 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -9,6 +9,7 @@ static DEFINE_MUTEX(erofs_domain_list_lock); static DEFINE_MUTEX(erofs_domain_cookies_lock); static LIST_HEAD(erofs_domain_list); +static LIST_HEAD(erofs_domain_cookies_list); static struct vfsmount *erofs_pseudo_mnt; struct erofs_fscache_request { @@ -164,18 +165,8 @@ static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie, static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio) { int ret; - struct super_block *sb = folio_mapping(folio)->host->i_sb; + struct erofs_fscache *ctx = folio_mapping(folio)->host->i_private; struct erofs_fscache_request *req; - struct erofs_map_dev mdev = { - .m_deviceid = 0, - .m_pa = folio_pos(folio), - }; - - ret = erofs_map_dev(sb, &mdev); - if (ret) { - folio_unlock(folio); - return ret; - } req = erofs_fscache_req_alloc(folio_mapping(folio), folio_pos(folio), folio_size(folio)); @@ -184,8 +175,8 @@ static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio) return PTR_ERR(req); } - ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie, - req, mdev.m_pa, folio_size(folio)); + ret = erofs_fscache_read_folios_async(ctx->cookie, req, + folio_pos(folio), folio_size(folio)); if (ret) req->error = ret; @@ -207,7 +198,7 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary) int ret; map.m_la = pos; - ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); + ret = erofs_map_blocks(inode, &map); if (ret) return ret; @@ -328,8 +319,6 @@ const struct address_space_operations erofs_fscache_access_aops = { static void erofs_fscache_domain_put(struct erofs_domain *domain) { - if (!domain) - return; mutex_lock(&erofs_domain_list_lock); if (refcount_dec_and_test(&domain->ref)) { list_del(&domain->list); @@ -337,8 +326,8 @@ static void erofs_fscache_domain_put(struct erofs_domain *domain) kern_unmount(erofs_pseudo_mnt); erofs_pseudo_mnt = NULL; } - mutex_unlock(&erofs_domain_list_lock); fscache_relinquish_volume(domain->volume, NULL, false); 
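The surrounding fscache.c hunks stop abusing the pseudo mount's s_inodes list (with i_count standing in for a cookie refcount) and instead keep shareable cookies on erofs_domain_cookies_list with an explicit refcount_t. Distilled to its core, the lookup-or-share pattern looks like this (hypothetical names):

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/string.h>

static DEFINE_MUTEX(cookie_lock);
static LIST_HEAD(cookie_list);

struct cookie_sketch {
	struct list_head node;
	refcount_t ref;
	const char *name;
};

static struct cookie_sketch *cookie_find_get(const char *name)
{
	struct cookie_sketch *c;

	mutex_lock(&cookie_lock);
	list_for_each_entry(c, &cookie_list, node) {
		if (!strcmp(c->name, name)) {
			refcount_inc(&c->ref);	/* share the existing blob */
			mutex_unlock(&cookie_lock);
			return c;
		}
	}
	mutex_unlock(&cookie_lock);
	return NULL;	/* caller allocates with ref = 1 and list_add()s */
}

static void cookie_put(struct cookie_sketch *c)
{
	mutex_lock(&cookie_lock);
	if (refcount_dec_and_test(&c->ref)) {
		list_del(&c->node);
		kfree(c);
	}
	mutex_unlock(&cookie_lock);
}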
+ mutex_unlock(&erofs_domain_list_lock); kfree(domain->domain_id); kfree(domain); return; @@ -431,19 +420,21 @@ static int erofs_fscache_register_domain(struct super_block *sb) return err; } -static -struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, - char *name, - unsigned int flags) +static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, + char *name, unsigned int flags) { struct fscache_volume *volume = EROFS_SB(sb)->volume; struct erofs_fscache *ctx; struct fscache_cookie *cookie; + struct super_block *isb; + struct inode *inode; int ret; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return ERR_PTR(-ENOMEM); + INIT_LIST_HEAD(&ctx->node); + refcount_set(&ctx->ref, 1); cookie = fscache_acquire_cookie(volume, FSCACHE_ADV_WANT_CACHE_SIZE, name, strlen(name), NULL, 0, 0); @@ -452,32 +443,32 @@ struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, ret = -EINVAL; goto err; } - fscache_use_cookie(cookie, false); - ctx->cookie = cookie; - - if (flags & EROFS_REG_COOKIE_NEED_INODE) { - struct inode *const inode = new_inode(sb); - - if (!inode) { - erofs_err(sb, "failed to get anon inode for %s", name); - ret = -ENOMEM; - goto err_cookie; - } - set_nlink(inode, 1); - inode->i_size = OFFSET_MAX; - inode->i_mapping->a_ops = &erofs_fscache_meta_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - - ctx->inode = inode; + /* + * Allocate anonymous inode in global pseudo mount for shareable blobs, + * so that they are accessible among erofs fs instances. + */ + isb = flags & EROFS_REG_COOKIE_SHARE ? erofs_pseudo_mnt->mnt_sb : sb; + inode = new_inode(isb); + if (!inode) { + erofs_err(sb, "failed to get anon inode for %s", name); + ret = -ENOMEM; + goto err_cookie; } + inode->i_size = OFFSET_MAX; + inode->i_mapping->a_ops = &erofs_fscache_meta_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + inode->i_private = ctx; + + ctx->cookie = cookie; + ctx->inode = inode; return ctx; err_cookie: - fscache_unuse_cookie(ctx->cookie, NULL, NULL); - fscache_relinquish_cookie(ctx->cookie, false); + fscache_unuse_cookie(cookie, NULL, NULL); + fscache_relinquish_cookie(cookie, false); err: kfree(ctx); return ERR_PTR(ret); @@ -492,13 +483,9 @@ static void erofs_fscache_relinquish_cookie(struct erofs_fscache *ctx) kfree(ctx); } -static -struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb, - char *name, - unsigned int flags) +static struct erofs_fscache *erofs_domain_init_cookie(struct super_block *sb, + char *name, unsigned int flags) { - int err; - struct inode *inode; struct erofs_fscache *ctx; struct erofs_domain *domain = EROFS_SB(sb)->domain; @@ -508,55 +495,38 @@ struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb, ctx->name = kstrdup(name, GFP_KERNEL); if (!ctx->name) { - err = -ENOMEM; - goto out; - } - - inode = new_inode(erofs_pseudo_mnt->mnt_sb); - if (!inode) { - err = -ENOMEM; - goto out; + erofs_fscache_relinquish_cookie(ctx); + return ERR_PTR(-ENOMEM); } - ctx->domain = domain; - ctx->anon_inode = inode; - inode->i_private = ctx; refcount_inc(&domain->ref); + ctx->domain = domain; + list_add(&ctx->node, &erofs_domain_cookies_list); return ctx; -out: - erofs_fscache_relinquish_cookie(ctx); - return ERR_PTR(err); } -static -struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb, - char *name, - unsigned int flags) +static struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb, + char *name, unsigned int flags) { - struct inode 
*inode; struct erofs_fscache *ctx; struct erofs_domain *domain = EROFS_SB(sb)->domain; - struct super_block *psb = erofs_pseudo_mnt->mnt_sb; + flags |= EROFS_REG_COOKIE_SHARE; mutex_lock(&erofs_domain_cookies_lock); - spin_lock(&psb->s_inode_list_lock); - list_for_each_entry(inode, &psb->s_inodes, i_sb_list) { - ctx = inode->i_private; - if (!ctx || ctx->domain != domain || strcmp(ctx->name, name)) + list_for_each_entry(ctx, &erofs_domain_cookies_list, node) { + if (ctx->domain != domain || strcmp(ctx->name, name)) continue; if (!(flags & EROFS_REG_COOKIE_NEED_NOEXIST)) { - igrab(inode); + refcount_inc(&ctx->ref); } else { erofs_err(sb, "%s already exists in domain %s", name, domain->domain_id); ctx = ERR_PTR(-EEXIST); } - spin_unlock(&psb->s_inode_list_lock); mutex_unlock(&erofs_domain_cookies_lock); return ctx; } - spin_unlock(&psb->s_inode_list_lock); - ctx = erofs_fscache_domain_init_cookie(sb, name, flags); + ctx = erofs_domain_init_cookie(sb, name, flags); mutex_unlock(&erofs_domain_cookies_lock); return ctx; } @@ -572,23 +542,22 @@ struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx) { - bool drop; - struct erofs_domain *domain; + struct erofs_domain *domain = NULL; if (!ctx) return; - domain = ctx->domain; - if (domain) { - mutex_lock(&erofs_domain_cookies_lock); - drop = atomic_read(&ctx->anon_inode->i_count) == 1; - iput(ctx->anon_inode); - mutex_unlock(&erofs_domain_cookies_lock); - if (!drop) - return; - } + if (!ctx->domain) + return erofs_fscache_relinquish_cookie(ctx); - erofs_fscache_relinquish_cookie(ctx); - erofs_fscache_domain_put(domain); + mutex_lock(&erofs_domain_cookies_lock); + if (refcount_dec_and_test(&ctx->ref)) { + domain = ctx->domain; + list_del(&ctx->node); + erofs_fscache_relinquish_cookie(ctx); + } + mutex_unlock(&erofs_domain_cookies_lock); + if (domain) + erofs_fscache_domain_put(domain); } int erofs_fscache_register_fs(struct super_block *sb) @@ -596,7 +565,7 @@ int erofs_fscache_register_fs(struct super_block *sb) int ret; struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_fscache *fscache; - unsigned int flags; + unsigned int flags = 0; if (sbi->domain_id) ret = erofs_fscache_register_domain(sb); @@ -615,7 +584,6 @@ int erofs_fscache_register_fs(struct super_block *sb) * * Acquired domain/volume will be relinquished in kill_sb() on error. */ - flags = EROFS_REG_COOKIE_NEED_INODE; if (sbi->domain_id) flags |= EROFS_REG_COOKIE_NEED_NOEXIST; fscache = erofs_fscache_register_cookie(sb, sbi->fsid, flags); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index d3b8736fa124..4be7dda3cd24 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -14,7 +14,7 @@ static void *erofs_read_inode(struct erofs_buf *buf, struct super_block *sb = inode->i_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_inode *vi = EROFS_I(inode); - const erofs_off_t inode_loc = iloc(sbi, vi->nid); + const erofs_off_t inode_loc = erofs_iloc(inode); erofs_blk_t blkaddr, nblks = 0; void *kaddr; @@ -308,52 +308,54 @@ out_unlock: } /* - * erofs nid is 64bits, but i_ino is 'unsigned long', therefore - * we should do more for 32-bit platform to find the right inode. + * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down + * so that it will fit. 
*/ -static int erofs_ilookup_test_actor(struct inode *inode, void *opaque) +static ino_t erofs_squash_ino(erofs_nid_t nid) { - const erofs_nid_t nid = *(erofs_nid_t *)opaque; + ino_t ino = (ino_t)nid; + + if (sizeof(ino_t) < sizeof(erofs_nid_t)) + ino ^= nid >> (sizeof(erofs_nid_t) - sizeof(ino_t)) * 8; + return ino; +} - return EROFS_I(inode)->nid == nid; +static int erofs_iget5_eq(struct inode *inode, void *opaque) +{ + return EROFS_I(inode)->nid == *(erofs_nid_t *)opaque; } -static int erofs_iget_set_actor(struct inode *inode, void *opaque) +static int erofs_iget5_set(struct inode *inode, void *opaque) { const erofs_nid_t nid = *(erofs_nid_t *)opaque; - inode->i_ino = erofs_inode_hash(nid); + inode->i_ino = erofs_squash_ino(nid); + EROFS_I(inode)->nid = nid; return 0; } struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid) { - const unsigned long hashval = erofs_inode_hash(nid); struct inode *inode; - inode = iget5_locked(sb, hashval, erofs_ilookup_test_actor, - erofs_iget_set_actor, &nid); + inode = iget5_locked(sb, erofs_squash_ino(nid), erofs_iget5_eq, + erofs_iget5_set, &nid); if (!inode) return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { - int err; - struct erofs_inode *vi = EROFS_I(inode); - - vi->nid = nid; + int err = erofs_fill_inode(inode); - err = erofs_fill_inode(inode); - if (!err) { - unlock_new_inode(inode); - } else { + if (err) { iget_failed(inode); - inode = ERR_PTR(err); + return ERR_PTR(err); } + unlock_new_inode(inode); } return inode; } -int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int erofs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -366,7 +368,7 @@ int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path, stat->attributes_mask |= (STATX_ATTR_COMPRESSED | STATX_ATTR_IMMUTABLE); - generic_fillattr(mnt_userns, inode, stat); + generic_fillattr(idmap, inode, stat); return 0; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index bb8501c0ff5b..3f3561d37d1b 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -12,7 +12,6 @@ #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/bio.h> -#include <linux/buffer_head.h> #include <linux/magic.h> #include <linux/slab.h> #include <linux/vmalloc.h> @@ -108,9 +107,12 @@ struct erofs_domain { struct erofs_fscache { struct fscache_cookie *cookie; - struct inode *inode; - struct inode *anon_inode; + struct inode *inode; /* anonymous inode for the blob */ + + /* used for share domain mode */ struct erofs_domain *domain; + struct list_head node; + refcount_t ref; char *name; }; @@ -271,11 +273,6 @@ struct erofs_buf { #define erofs_blkoff(addr) ((addr) % EROFS_BLKSIZ) #define blknr_to_addr(nr) ((erofs_off_t)(nr) * EROFS_BLKSIZ) -static inline erofs_off_t iloc(struct erofs_sb_info *sbi, erofs_nid_t nid) -{ - return blknr_to_addr(sbi->meta_blkaddr) + (nid << sbi->islotbits); -} - #define EROFS_FEATURE_FUNCS(name, compat, feature) \ static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \ { \ @@ -340,13 +337,14 @@ struct erofs_inode { struct inode vfs_inode; }; -#define EROFS_I(ptr) \ - container_of(ptr, struct erofs_inode, vfs_inode) +#define EROFS_I(ptr) container_of(ptr, struct erofs_inode, vfs_inode) -static inline unsigned long erofs_inode_datablocks(struct inode *inode) +static inline erofs_off_t erofs_iloc(struct inode *inode) { - /* since i_size cannot be changed */ - return DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ); + struct erofs_sb_info 
*sbi = EROFS_I_SB(inode); + + return blknr_to_addr(sbi->meta_blkaddr) + + (EROFS_I(inode)->nid << sbi->islotbits); } static inline unsigned int erofs_bitrange(unsigned int value, unsigned int bit, @@ -382,31 +380,18 @@ struct page *erofs_grab_cache_page_nowait(struct address_space *mapping, readahead_gfp_mask(mapping) & ~__GFP_RECLAIM); } -extern const struct super_operations erofs_sops; -extern struct file_system_type erofs_fs_type; - -extern const struct address_space_operations erofs_raw_access_aops; -extern const struct address_space_operations z_erofs_aops; - -enum { - BH_Encoded = BH_PrivateStart, - BH_FullMapped, - BH_Fragment, - BH_Partialref, -}; - /* Has a disk mapping */ -#define EROFS_MAP_MAPPED (1 << BH_Mapped) +#define EROFS_MAP_MAPPED 0x0001 /* Located in metadata (could be copied from bd_inode) */ -#define EROFS_MAP_META (1 << BH_Meta) +#define EROFS_MAP_META 0x0002 /* The extent is encoded */ -#define EROFS_MAP_ENCODED (1 << BH_Encoded) +#define EROFS_MAP_ENCODED 0x0004 /* The length of extent is full */ -#define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped) +#define EROFS_MAP_FULL_MAPPED 0x0008 /* Located in the special packed inode */ -#define EROFS_MAP_FRAGMENT (1 << BH_Fragment) +#define EROFS_MAP_FRAGMENT 0x0010 /* The extent refers to partial decompressed data */ -#define EROFS_MAP_PARTIAL_REF (1 << BH_Partialref) +#define EROFS_MAP_PARTIAL_REF 0x0020 struct erofs_map_blocks { struct erofs_buf buf; @@ -419,17 +404,15 @@ struct erofs_map_blocks { unsigned int m_flags; }; -/* Flags used by erofs_map_blocks_flatmode() */ -#define EROFS_GET_BLOCKS_RAW 0x0001 /* * Used to get the exact decompressed length, e.g. fiemap (consider lookback * approach instead if possible since it's more metadata lightweight.) */ -#define EROFS_GET_BLOCKS_FIEMAP 0x0002 +#define EROFS_GET_BLOCKS_FIEMAP 0x0001 /* Used to map the whole extent if non-negligible data is requested for LZMA */ -#define EROFS_GET_BLOCKS_READMORE 0x0004 +#define EROFS_GET_BLOCKS_READMORE 0x0002 /* Used to map tail extent for tailpacking inline or fragment pcluster */ -#define EROFS_GET_BLOCKS_FINDTAIL 0x0008 +#define EROFS_GET_BLOCKS_FINDTAIL 0x0004 enum { Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX, @@ -437,24 +420,6 @@ enum { Z_EROFS_COMPRESSION_RUNTIME_MAX }; -/* zmap.c */ -extern const struct iomap_ops z_erofs_iomap_report_ops; - -#ifdef CONFIG_EROFS_FS_ZIP -int z_erofs_fill_inode(struct inode *inode); -int z_erofs_map_blocks_iter(struct inode *inode, - struct erofs_map_blocks *map, - int flags); -#else -static inline int z_erofs_fill_inode(struct inode *inode) { return -EOPNOTSUPP; } -static inline int z_erofs_map_blocks_iter(struct inode *inode, - struct erofs_map_blocks *map, - int flags) -{ - return -EOPNOTSUPP; -} -#endif /* !CONFIG_EROFS_FS_ZIP */ - struct erofs_map_dev { struct erofs_fscache *m_fscache; struct block_device *m_bdev; @@ -465,8 +430,27 @@ struct erofs_map_dev { unsigned int m_deviceid; }; -/* data.c */ +extern struct file_system_type erofs_fs_type; +extern const struct super_operations erofs_sops; + +extern const struct address_space_operations erofs_raw_access_aops; +extern const struct address_space_operations z_erofs_aops; +extern const struct address_space_operations erofs_fscache_access_aops; + +extern const struct inode_operations erofs_generic_iops; +extern const struct inode_operations erofs_symlink_iops; +extern const struct inode_operations erofs_fast_symlink_iops; +extern const struct inode_operations erofs_dir_iops; + extern const struct file_operations erofs_file_fops; 
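erofs_squash_ino() in the inode.c hunk above replaces erofs_inode_hash(): the 64-bit nid is truncated to ino_t and, on 32-bit builds, the discarded high half is XORed back in so distinct nids rarely collide. Because erofs_iget5_eq() still compares the full nid, a fold collision only lengthens the iget5_locked() hash-chain walk and can never return the wrong inode. The fold in isolation:

#include <linux/types.h>

/* u64 stands in for erofs_nid_t here */
static ino_t squash_ino_sketch(u64 nid)
{
	ino_t ino = (ino_t)nid;

	/* 32-bit: mix the truncated high 32 bits back into the low half */
	if (sizeof(ino_t) < sizeof(nid))
		ino ^= nid >> (sizeof(nid) - sizeof(ino_t)) * 8;
	return ino;
}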
+extern const struct file_operations erofs_dir_fops; + +extern const struct iomap_ops z_erofs_iomap_report_ops; + +/* flags for erofs_fscache_register_cookie() */ +#define EROFS_REG_COOKIE_SHARE 0x0001 +#define EROFS_REG_COOKIE_NEED_NOEXIST 0x0002 + void erofs_unmap_metabuf(struct erofs_buf *buf); void erofs_put_metabuf(struct erofs_buf *buf); void *erofs_bread(struct erofs_buf *buf, struct inode *inode, @@ -476,37 +460,14 @@ void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); -int erofs_map_blocks(struct inode *inode, - struct erofs_map_blocks *map, int flags); - -/* inode.c */ -static inline unsigned long erofs_inode_hash(erofs_nid_t nid) -{ -#if BITS_PER_LONG == 32 - return (nid >> 32) ^ (nid & 0xffffffff); -#else - return nid; -#endif -} - -extern const struct inode_operations erofs_generic_iops; -extern const struct inode_operations erofs_symlink_iops; -extern const struct inode_operations erofs_fast_symlink_iops; - +int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map); struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid); -int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int erofs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); - -/* namei.c */ -extern const struct inode_operations erofs_dir_iops; - int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, unsigned int *d_type); -/* dir.c */ -extern const struct file_operations erofs_dir_fops; - static inline void *erofs_vm_map_ram(struct page **pages, unsigned int count) { int retried = 0; @@ -522,23 +483,19 @@ static inline void *erofs_vm_map_ram(struct page **pages, unsigned int count) return NULL; } -/* pcpubuf.c */ void *erofs_get_pcpubuf(unsigned int requiredpages); void erofs_put_pcpubuf(void *ptr); int erofs_pcpubuf_growsize(unsigned int nrpages); void erofs_pcpubuf_init(void); void erofs_pcpubuf_exit(void); -/* sysfs.c */ int erofs_register_sysfs(struct super_block *sb); void erofs_unregister_sysfs(struct super_block *sb); int __init erofs_init_sysfs(void); void erofs_exit_sysfs(void); -/* utils.c / zdata.c */ struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp); -static inline void erofs_pagepool_add(struct page **pagepool, - struct page *page) +static inline void erofs_pagepool_add(struct page **pagepool, struct page *page) { set_page_private(page, (unsigned long)*pagepool); *pagepool = page; @@ -564,6 +521,9 @@ int erofs_try_to_free_cached_page(struct page *page); int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, struct z_erofs_lz4_cfgs *lz4, int len); +int z_erofs_fill_inode(struct inode *inode); +int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, + int flags); #else static inline void erofs_shrinker_register(struct super_block *sb) {} static inline void erofs_shrinker_unregister(struct super_block *sb) {} @@ -581,6 +541,7 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb, } return 0; } +static inline int z_erofs_fill_inode(struct inode *inode) { return -EOPNOTSUPP; } #endif /* !CONFIG_EROFS_FS_ZIP */ #ifdef CONFIG_EROFS_FS_ZIP_LZMA @@ -601,23 +562,15 @@ static inline int z_erofs_load_lzma_config(struct super_block *sb, } return 0; } -#endif /* !CONFIG_EROFS_FS_ZIP */ +#endif /* 
!CONFIG_EROFS_FS_ZIP_LZMA */ -/* flags for erofs_fscache_register_cookie() */ -#define EROFS_REG_COOKIE_NEED_INODE 1 -#define EROFS_REG_COOKIE_NEED_NOEXIST 2 - -/* fscache.c */ #ifdef CONFIG_EROFS_FS_ONDEMAND int erofs_fscache_register_fs(struct super_block *sb); void erofs_fscache_unregister_fs(struct super_block *sb); struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, - char *name, - unsigned int flags); + char *name, unsigned int flags); void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache); - -extern const struct address_space_operations erofs_fscache_access_aops; #else static inline int erofs_fscache_register_fs(struct super_block *sb) { @@ -627,8 +580,7 @@ static inline void erofs_fscache_unregister_fs(struct super_block *sb) {} static inline struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, - char *name, - unsigned int flags) + char *name, unsigned int flags) { return ERR_PTR(-EOPNOTSUPP); } diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index b64a108fac92..966eabc61c13 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -5,7 +5,6 @@ * Copyright (C) 2022, Alibaba Cloud */ #include "xattr.h" - #include <trace/events/erofs.h> struct erofs_qstr { @@ -87,19 +86,13 @@ static struct erofs_dirent *find_target_dirent(struct erofs_qstr *name, return ERR_PTR(-ENOENT); } -static void *find_target_block_classic(struct erofs_buf *target, - struct inode *dir, - struct erofs_qstr *name, - int *_ndirents) +static void *erofs_find_target_block(struct erofs_buf *target, + struct inode *dir, struct erofs_qstr *name, int *_ndirents) { - unsigned int startprfx, endprfx; - int head, back; + int head = 0, back = DIV_ROUND_UP(dir->i_size, EROFS_BLKSIZ) - 1; + unsigned int startprfx = 0, endprfx = 0; void *candidate = ERR_PTR(-ENOENT); - startprfx = endprfx = 0; - head = 0; - back = erofs_inode_datablocks(dir) - 1; - while (head <= back) { const int mid = head + (back - head) / 2; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; @@ -180,8 +173,7 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, qn.end = name->name + name->len; ndirents = 0; - - de = find_target_block_classic(&buf, dir, &qn, &ndirents); + de = erofs_find_target_block(&buf, dir, &qn, &ndirents); if (IS_ERR(de)) return PTR_ERR(de); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 626a615dafc2..19b1ae79cec4 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -5,7 +5,6 @@ * Copyright (C) 2021, Alibaba Cloud */ #include <linux/module.h> -#include <linux/buffer_head.h> #include <linux/statfs.h> #include <linux/parser.h> #include <linux/seq_file.h> @@ -969,6 +968,8 @@ static void erofs_put_super(struct super_block *sb) iput(sbi->packed_inode); sbi->packed_inode = NULL; #endif + erofs_free_dev_context(sbi->devs); + sbi->devs = NULL; erofs_fscache_unregister_fs(sb); } diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c index fd476961f742..435e515c0792 100644 --- a/fs/erofs/sysfs.c +++ b/fs/erofs/sysfs.c @@ -179,13 +179,13 @@ static const struct sysfs_ops erofs_attr_ops = { .store = erofs_attr_store, }; -static struct kobj_type erofs_sb_ktype = { +static const struct kobj_type erofs_sb_ktype = { .default_groups = erofs_groups, .sysfs_ops = &erofs_attr_ops, .release = erofs_sb_release, }; -static struct kobj_type erofs_ktype = { +static const struct kobj_type erofs_ktype = { .sysfs_ops = &erofs_attr_ops, }; @@ -193,7 +193,7 @@ static struct kset erofs_root = { .kobj = {.ktype = &erofs_ktype}, }; -static struct kobj_type erofs_feat_ktype = { 
+static const struct kobj_type erofs_feat_ktype = { .default_groups = erofs_feat_groups, .sysfs_ops = &erofs_attr_ops, }; diff --git a/fs/erofs/tagptr.h b/fs/erofs/tagptr.h deleted file mode 100644 index 64ceb7270b5c..000000000000 --- a/fs/erofs/tagptr.h +++ /dev/null @@ -1,107 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * A tagged pointer implementation - */ -#ifndef __EROFS_FS_TAGPTR_H -#define __EROFS_FS_TAGPTR_H - -#include <linux/types.h> -#include <linux/build_bug.h> - -/* - * the name of tagged pointer types are tagptr{1, 2, 3...}_t - * avoid directly using the internal structs __tagptr{1, 2, 3...} - */ -#define __MAKE_TAGPTR(n) \ -typedef struct __tagptr##n { \ - uintptr_t v; \ -} tagptr##n##_t; - -__MAKE_TAGPTR(1) -__MAKE_TAGPTR(2) -__MAKE_TAGPTR(3) -__MAKE_TAGPTR(4) - -#undef __MAKE_TAGPTR - -extern void __compiletime_error("bad tagptr tags") - __bad_tagptr_tags(void); - -extern void __compiletime_error("bad tagptr type") - __bad_tagptr_type(void); - -/* fix the broken usage of "#define tagptr2_t tagptr3_t" by users */ -#define __tagptr_mask_1(ptr, n) \ - __builtin_types_compatible_p(typeof(ptr), struct __tagptr##n) ? \ - (1UL << (n)) - 1 : - -#define __tagptr_mask(ptr) (\ - __tagptr_mask_1(ptr, 1) ( \ - __tagptr_mask_1(ptr, 2) ( \ - __tagptr_mask_1(ptr, 3) ( \ - __tagptr_mask_1(ptr, 4) ( \ - __bad_tagptr_type(), 0))))) - -/* generate a tagged pointer from a raw value */ -#define tagptr_init(type, val) \ - ((typeof(type)){ .v = (uintptr_t)(val) }) - -/* - * directly cast a tagged pointer to the native pointer type, which - * could be used for backward compatibility of existing code. - */ -#define tagptr_cast_ptr(tptr) ((void *)(tptr).v) - -/* encode tagged pointers */ -#define tagptr_fold(type, ptr, _tags) ({ \ - const typeof(_tags) tags = (_tags); \ - if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(type))) \ - __bad_tagptr_tags(); \ -tagptr_init(type, (uintptr_t)(ptr) | tags); }) - -/* decode tagged pointers */ -#define tagptr_unfold_ptr(tptr) \ - ((void *)((tptr).v & ~__tagptr_mask(tptr))) - -#define tagptr_unfold_tags(tptr) \ - ((tptr).v & __tagptr_mask(tptr)) - -/* operations for the tagger pointer */ -#define tagptr_eq(_tptr1, _tptr2) ({ \ - typeof(_tptr1) tptr1 = (_tptr1); \ - typeof(_tptr2) tptr2 = (_tptr2); \ - (void)(&tptr1 == &tptr2); \ -(tptr1).v == (tptr2).v; }) - -/* lock-free CAS operation */ -#define tagptr_cmpxchg(_ptptr, _o, _n) ({ \ - typeof(_ptptr) ptptr = (_ptptr); \ - typeof(_o) o = (_o); \ - typeof(_n) n = (_n); \ - (void)(&o == &n); \ - (void)(&o == ptptr); \ -tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); }) - -/* wrap WRITE_ONCE if atomic update is needed */ -#define tagptr_replace_tags(_ptptr, tags) ({ \ - typeof(_ptptr) ptptr = (_ptptr); \ - *ptptr = tagptr_fold(*ptptr, tagptr_unfold_ptr(*ptptr), tags); \ -*ptptr; }) - -#define tagptr_set_tags(_ptptr, _tags) ({ \ - typeof(_ptptr) ptptr = (_ptptr); \ - const typeof(_tags) tags = (_tags); \ - if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \ - __bad_tagptr_tags(); \ - ptptr->v |= tags; \ -*ptptr; }) - -#define tagptr_clear_tags(_ptptr, _tags) ({ \ - typeof(_ptptr) ptptr = (_ptptr); \ - const typeof(_tags) tags = (_tags); \ - if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \ - __bad_tagptr_tags(); \ - ptptr->v &= ~tags; \ -*ptptr; }) - -#endif /* __EROFS_FS_TAGPTR_H */ diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index a62fb8a3318a..60729b1220b6 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -22,8 +22,7 @@ static int 
init_inode_xattrs(struct inode *inode) struct xattr_iter it; unsigned int i; struct erofs_xattr_ibody_header *ih; - struct super_block *sb; - struct erofs_sb_info *sbi; + struct super_block *sb = inode->i_sb; int ret = 0; /* the most case is that xattrs of this inode are initialized. */ @@ -52,15 +51,14 @@ static int init_inode_xattrs(struct inode *inode) * undefined right now (maybe use later with some new sb feature). */ if (vi->xattr_isize == sizeof(struct erofs_xattr_ibody_header)) { - erofs_err(inode->i_sb, + erofs_err(sb, "xattr_isize %d of nid %llu is not supported yet", vi->xattr_isize, vi->nid); ret = -EOPNOTSUPP; goto out_unlock; } else if (vi->xattr_isize < sizeof(struct erofs_xattr_ibody_header)) { if (vi->xattr_isize) { - erofs_err(inode->i_sb, - "bogus xattr ibody @ nid %llu", vi->nid); + erofs_err(sb, "bogus xattr ibody @ nid %llu", vi->nid); DBG_BUGON(1); ret = -EFSCORRUPTED; goto out_unlock; /* xattr ondisk layout error */ @@ -69,11 +67,9 @@ static int init_inode_xattrs(struct inode *inode) goto out_unlock; } - sb = inode->i_sb; - sbi = EROFS_SB(sb); it.buf = __EROFS_BUF_INITIALIZER; - it.blkaddr = erofs_blknr(iloc(sbi, vi->nid) + vi->inode_isize); - it.ofs = erofs_blkoff(iloc(sbi, vi->nid) + vi->inode_isize); + it.blkaddr = erofs_blknr(erofs_iloc(inode) + vi->inode_isize); + it.ofs = erofs_blkoff(erofs_iloc(inode) + vi->inode_isize); /* read in shared xattr array (non-atomic, see kmalloc below) */ it.kaddr = erofs_read_metabuf(&it.buf, sb, it.blkaddr, EROFS_KMAP); @@ -159,7 +155,6 @@ static int inline_xattr_iter_begin(struct xattr_iter *it, struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); - struct erofs_sb_info *const sbi = EROFS_SB(inode->i_sb); unsigned int xattr_header_sz, inline_xattr_ofs; xattr_header_sz = inlinexattr_header_size(inode); @@ -170,9 +165,8 @@ static int inline_xattr_iter_begin(struct xattr_iter *it, inline_xattr_ofs = vi->inode_isize + xattr_header_sz; - it->blkaddr = erofs_blknr(iloc(sbi, vi->nid) + inline_xattr_ofs); - it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs); - + it->blkaddr = erofs_blknr(erofs_iloc(inode) + inline_xattr_ofs); + it->ofs = erofs_blkoff(erofs_iloc(inode) + inline_xattr_ofs); it->kaddr = erofs_read_metabuf(&it->buf, inode->i_sb, it->blkaddr, EROFS_KMAP); if (IS_ERR(it->kaddr)) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 5200bb86e264..3247d2422bea 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -4,13 +4,178 @@ * https://www.huawei.com/ * Copyright (C) 2022 Alibaba Cloud */ -#include "zdata.h" #include "compress.h" #include <linux/prefetch.h> #include <linux/psi.h> - +#include <linux/cpuhotplug.h> #include <trace/events/erofs.h> +#define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) +#define Z_EROFS_INLINE_BVECS 2 + +/* + * let's leave a type here in case of introducing + * another tagged pointer later. + */ +typedef void *z_erofs_next_pcluster_t; + +struct z_erofs_bvec { + struct page *page; + int offset; + unsigned int end; +}; + +#define __Z_EROFS_BVSET(name, total) \ +struct name { \ + /* point to the next page which contains the following bvecs */ \ + struct page *nextpage; \ + struct z_erofs_bvec bvec[total]; \ +} +__Z_EROFS_BVSET(z_erofs_bvset,); +__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS); + +/* + * Structure fields follow one of the following exclusion rules. 
+ * + * I: Modifiable by initialization/destruction paths and read-only + * for everyone else; + * + * L: Field should be protected by the pcluster lock; + * + * A: Field should be accessed / updated in atomic for parallelized code. + */ +struct z_erofs_pcluster { + struct erofs_workgroup obj; + struct mutex lock; + + /* A: point to next chained pcluster or TAILs */ + z_erofs_next_pcluster_t next; + + /* L: the maximum decompression size of this round */ + unsigned int length; + + /* L: total number of bvecs */ + unsigned int vcnt; + + /* I: page offset of start position of decompression */ + unsigned short pageofs_out; + + /* I: page offset of inline compressed data */ + unsigned short pageofs_in; + + union { + /* L: inline a certain number of bvec for bootstrap */ + struct z_erofs_bvset_inline bvset; + + /* I: can be used to free the pcluster by RCU. */ + struct rcu_head rcu; + }; + + union { + /* I: physical cluster size in pages */ + unsigned short pclusterpages; + + /* I: tailpacking inline compressed size */ + unsigned short tailpacking_size; + }; + + /* I: compression algorithm format */ + unsigned char algorithmformat; + + /* L: whether partial decompression or not */ + bool partial; + + /* L: indicate several pageofs_outs or not */ + bool multibases; + + /* A: compressed bvecs (can be cached or inplaced pages) */ + struct z_erofs_bvec compressed_bvecs[]; +}; + +/* let's avoid the valid 32-bit kernel addresses */ + +/* the chained workgroup has't submitted io (still open) */ +#define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE) +/* the chained workgroup has already submitted io */ +#define Z_EROFS_PCLUSTER_TAIL_CLOSED ((void *)0x5F0EDEAD) + +#define Z_EROFS_PCLUSTER_NIL (NULL) + +struct z_erofs_decompressqueue { + struct super_block *sb; + atomic_t pending_bios; + z_erofs_next_pcluster_t head; + + union { + struct completion done; + struct work_struct work; + struct kthread_work kthread_work; + } u; + bool eio, sync; +}; + +static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) +{ + return !pcl->obj.index; +} + +static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) +{ + if (z_erofs_is_inline_pcluster(pcl)) + return 1; + return pcl->pclusterpages; +} + +/* + * bit 30: I/O error occurred on this page + * bit 0 - 29: remaining parts to complete this page + */ +#define Z_EROFS_PAGE_EIO (1 << 30) + +static inline void z_erofs_onlinepage_init(struct page *page) +{ + union { + atomic_t o; + unsigned long v; + } u = { .o = ATOMIC_INIT(1) }; + + set_page_private(page, u.v); + smp_wmb(); + SetPagePrivate(page); +} + +static inline void z_erofs_onlinepage_split(struct page *page) +{ + atomic_inc((atomic_t *)&page->private); +} + +static inline void z_erofs_page_mark_eio(struct page *page) +{ + int orig; + + do { + orig = atomic_read((atomic_t *)&page->private); + } while (atomic_cmpxchg((atomic_t *)&page->private, orig, + orig | Z_EROFS_PAGE_EIO) != orig); +} + +static inline void z_erofs_onlinepage_endio(struct page *page) +{ + unsigned int v; + + DBG_BUGON(!PagePrivate(page)); + v = atomic_dec_return((atomic_t *)&page->private); + if (!(v & ~Z_EROFS_PAGE_EIO)) { + set_page_private(page, 0); + ClearPagePrivate(page); + if (!(v & Z_EROFS_PAGE_EIO)) + SetPageUptodate(page); + unlock_page(page); + } +} + +#define Z_EROFS_ONSTACK_PAGES 32 + /* * since pclustersize is variable for big pcluster feature, introduce slab * pools implementation for different pcluster sizes. 
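The z_erofs_onlinepage_*() helpers moved into zdata.c above pack a completion counter (bits 0-29) and an I/O error flag (bit 30) into page->private: every decompression part covering the page takes a reference, and whichever part finishes last unlocks the page. A minimal user-space sketch of the same counting scheme, with C11 atomics standing in for the kernel's atomic_t and atomic_fetch_or standing in for the cmpxchg loop in z_erofs_page_mark_eio():

#include <stdatomic.h>
#include <stdio.h>

#define PAGE_EIO (1u << 30)        /* bit 30: I/O error; bits 0-29: parts left */

static atomic_uint priv = 1;       /* z_erofs_onlinepage_init(): one part */

static void page_split(void)    { atomic_fetch_add(&priv, 1); }
static void page_mark_eio(void) { atomic_fetch_or(&priv, PAGE_EIO); }

/* Returns 1 once the last part completes; uptodate only without EIO. */
static int page_endio(void)
{
        unsigned int v = atomic_fetch_sub(&priv, 1) - 1;

        if (v & ~PAGE_EIO)
                return 0;          /* other parts still outstanding */
        puts(v & PAGE_EIO ? "unlocked, not uptodate" : "unlocked, uptodate");
        return 1;
}

int main(void)
{
        page_split();              /* two parts now cover the page */
        page_endio();              /* first part done: page stays locked */
        page_mark_eio();           /* second part failed */
        page_endio();              /* last part: unlock without uptodate */
        return 0;
}

The page is finalized exactly once, by the last completing part, and is marked uptodate only if no part recorded an error.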
@@ -175,35 +340,130 @@ static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl) DBG_BUGON(1); } -/* - * tagged pointer with 1-bit tag for all compressed pages - * tag 0 - the page is just found with an extra page reference - */ -typedef tagptr1_t compressed_page_t; +static struct workqueue_struct *z_erofs_workqueue __read_mostly; -#define tag_compressed_page_justfound(page) \ - tagptr_fold(compressed_page_t, page, 1) +#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD +static struct kthread_worker __rcu **z_erofs_pcpu_workers; -static struct workqueue_struct *z_erofs_workqueue __read_mostly; +static void erofs_destroy_percpu_workers(void) +{ + struct kthread_worker *worker; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + worker = rcu_dereference_protected( + z_erofs_pcpu_workers[cpu], 1); + rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL); + if (worker) + kthread_destroy_worker(worker); + } + kfree(z_erofs_pcpu_workers); +} -void z_erofs_exit_zip_subsystem(void) +static struct kthread_worker *erofs_init_percpu_worker(int cpu) { - destroy_workqueue(z_erofs_workqueue); - z_erofs_destroy_pcluster_pool(); + struct kthread_worker *worker = + kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u", cpu); + + if (IS_ERR(worker)) + return worker; + if (IS_ENABLED(CONFIG_EROFS_FS_PCPU_KTHREAD_HIPRI)) + sched_set_fifo_low(worker->task); + else + sched_set_normal(worker->task, 0); + return worker; } -static inline int z_erofs_init_workqueue(void) +static int erofs_init_percpu_workers(void) { - const unsigned int onlinecpus = num_possible_cpus(); + struct kthread_worker *worker; + unsigned int cpu; - /* - * no need to spawn too many threads, limiting threads could minimum - * scheduling overhead, perhaps per-CPU threads should be better? - */ - z_erofs_workqueue = alloc_workqueue("erofs_unzipd", - WQ_UNBOUND | WQ_HIGHPRI, - onlinecpus + onlinecpus / 4); - return z_erofs_workqueue ? 0 : -ENOMEM; + z_erofs_pcpu_workers = kcalloc(num_possible_cpus(), + sizeof(struct kthread_worker *), GFP_ATOMIC); + if (!z_erofs_pcpu_workers) + return -ENOMEM; + + for_each_online_cpu(cpu) { /* could miss cpu{off,on}line? 
*/ + worker = erofs_init_percpu_worker(cpu); + if (!IS_ERR(worker)) + rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker); + } + return 0; +} +#else +static inline void erofs_destroy_percpu_workers(void) {} +static inline int erofs_init_percpu_workers(void) { return 0; } +#endif + +#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_EROFS_FS_PCPU_KTHREAD) +static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock); +static enum cpuhp_state erofs_cpuhp_state; + +static int erofs_cpu_online(unsigned int cpu) +{ + struct kthread_worker *worker, *old; + + worker = erofs_init_percpu_worker(cpu); + if (IS_ERR(worker)) + return PTR_ERR(worker); + + spin_lock(&z_erofs_pcpu_worker_lock); + old = rcu_dereference_protected(z_erofs_pcpu_workers[cpu], + lockdep_is_held(&z_erofs_pcpu_worker_lock)); + if (!old) + rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker); + spin_unlock(&z_erofs_pcpu_worker_lock); + if (old) + kthread_destroy_worker(worker); + return 0; +} + +static int erofs_cpu_offline(unsigned int cpu) +{ + struct kthread_worker *worker; + + spin_lock(&z_erofs_pcpu_worker_lock); + worker = rcu_dereference_protected(z_erofs_pcpu_workers[cpu], + lockdep_is_held(&z_erofs_pcpu_worker_lock)); + rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL); + spin_unlock(&z_erofs_pcpu_worker_lock); + + synchronize_rcu(); + if (worker) + kthread_destroy_worker(worker); + return 0; +} + +static int erofs_cpu_hotplug_init(void) +{ + int state; + + state = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "fs/erofs:online", erofs_cpu_online, erofs_cpu_offline); + if (state < 0) + return state; + + erofs_cpuhp_state = state; + return 0; +} + +static void erofs_cpu_hotplug_destroy(void) +{ + if (erofs_cpuhp_state) + cpuhp_remove_state_nocalls(erofs_cpuhp_state); +} +#else /* !CONFIG_HOTPLUG_CPU || !CONFIG_EROFS_FS_PCPU_KTHREAD */ +static inline int erofs_cpu_hotplug_init(void) { return 0; } +static inline void erofs_cpu_hotplug_destroy(void) {} +#endif + +void z_erofs_exit_zip_subsystem(void) +{ + erofs_cpu_hotplug_destroy(); + erofs_destroy_percpu_workers(); + destroy_workqueue(z_erofs_workqueue); + z_erofs_destroy_pcluster_pool(); } int __init z_erofs_init_zip_subsystem(void) @@ -211,10 +471,31 @@ int __init z_erofs_init_zip_subsystem(void) int err = z_erofs_create_pcluster_pool(); if (err) - return err; - err = z_erofs_init_workqueue(); + goto out_error_pcluster_pool; + + z_erofs_workqueue = alloc_workqueue("erofs_worker", + WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus()); + if (!z_erofs_workqueue) { + err = -ENOMEM; + goto out_error_workqueue_init; + } + + err = erofs_init_percpu_workers(); if (err) - z_erofs_destroy_pcluster_pool(); + goto out_error_pcpu_worker; + + err = erofs_cpu_hotplug_init(); + if (err < 0) + goto out_error_cpuhp_init; + return err; + +out_error_cpuhp_init: + erofs_destroy_percpu_workers(); +out_error_pcpu_worker: + destroy_workqueue(z_erofs_workqueue); +out_error_workqueue_init: + z_erofs_destroy_pcluster_pool(); +out_error_pcluster_pool: return err; } @@ -319,7 +600,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, for (i = 0; i < pcl->pclusterpages; ++i) { struct page *page; - compressed_page_t t; + void *t; /* mark pages just found for debugging */ struct page *newpage = NULL; /* the compressed page was loaded before */ @@ -329,7 +610,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, page = find_get_page(mc, pcl->obj.index + i); if (page) { - t = tag_compressed_page_justfound(page); + t = (void *)((unsigned long)page | 1); } else { /* I/O is 
needed, no possible to decompress directly */ standalone = false; @@ -345,11 +626,10 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, if (!newpage) continue; set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); - t = tag_compressed_page_justfound(newpage); + t = (void *)((unsigned long)newpage | 1); } - if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, - tagptr_cast_ptr(t))) + if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t)) continue; if (page) @@ -1151,18 +1431,24 @@ static void z_erofs_decompressqueue_work(struct work_struct *work) DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED); z_erofs_decompress_queue(bgq, &pagepool); - erofs_release_pages(&pagepool); kvfree(bgq); } +#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD +static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work) +{ + z_erofs_decompressqueue_work((struct work_struct *)work); +} +#endif + static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, - bool sync, int bios) + int bios) { struct erofs_sb_info *const sbi = EROFS_SB(io->sb); /* wake up the caller thread for sync decompression */ - if (sync) { + if (io->sync) { if (!atomic_add_return(bios, &io->pending_bios)) complete(&io->u.done); return; @@ -1170,9 +1456,24 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, if (atomic_add_return(bios, &io->pending_bios)) return; - /* Use workqueue and sync decompression for atomic contexts only */ + /* Use (kthread_)work and sync decompression for atomic contexts only */ if (in_atomic() || irqs_disabled()) { +#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD + struct kthread_worker *worker; + + rcu_read_lock(); + worker = rcu_dereference( + z_erofs_pcpu_workers[raw_smp_processor_id()]); + if (!worker) { + INIT_WORK(&io->u.work, z_erofs_decompressqueue_work); + queue_work(z_erofs_workqueue, &io->u.work); + } else { + kthread_queue_work(worker, &io->u.kthread_work); + } + rcu_read_unlock(); +#else queue_work(z_erofs_workqueue, &io->u.work); +#endif /* enable sync decompression for readahead */ if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON; @@ -1192,8 +1493,6 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, struct address_space *mapping; struct page *oldpage, *page; - - compressed_page_t t; int justfound; repeat: @@ -1203,10 +1502,8 @@ repeat: if (!page) goto out_allocpage; - /* process the target tagged pointer */ - t = tagptr_init(compressed_page_t, page); - justfound = tagptr_unfold_tags(t); - page = tagptr_unfold_ptr(t); + justfound = (unsigned long)page & 1UL; + page = (struct page *)((unsigned long)page & ~1UL); /* * preallocated cached pages, which is used to avoid direct reclaim @@ -1294,9 +1591,8 @@ out: /* the only exit (for tracing and debugging) */ return page; } -static struct z_erofs_decompressqueue * -jobqueue_init(struct super_block *sb, - struct z_erofs_decompressqueue *fgq, bool *fg) +static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb, + struct z_erofs_decompressqueue *fgq, bool *fg) { struct z_erofs_decompressqueue *q; @@ -1306,13 +1602,19 @@ jobqueue_init(struct super_block *sb, *fg = true; goto fg_out; } +#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD + kthread_init_work(&q->u.kthread_work, + z_erofs_decompressqueue_kthread_work); +#else INIT_WORK(&q->u.work, z_erofs_decompressqueue_work); +#endif } else { fg_out: q = fgq; init_completion(&fgq->u.done); atomic_set(&fgq->pending_bios, 0); q->eio = false; + q->sync 
= true; } q->sb = sb; q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED; @@ -1326,20 +1628,6 @@ enum { NR_JOBQUEUES, }; -static void *jobqueueset_init(struct super_block *sb, - struct z_erofs_decompressqueue *q[], - struct z_erofs_decompressqueue *fgq, bool *fg) -{ - /* - * if managed cache is enabled, bypass jobqueue is needed, - * no need to read from device for all pclusters in this queue. - */ - q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL); - q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, fg); - - return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], *fg)); -} - static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, z_erofs_next_pcluster_t qtail[], z_erofs_next_pcluster_t owned_head) @@ -1361,8 +1649,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, static void z_erofs_decompressqueue_endio(struct bio *bio) { - tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private); - struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t); + struct z_erofs_decompressqueue *q = bio->bi_private; blk_status_t err = bio->bi_status; struct bio_vec *bvec; struct bvec_iter_all iter_all; @@ -1381,7 +1668,7 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) } if (err) q->eio = true; - z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1); + z_erofs_decompress_kickoff(q, -1); bio_put(bio); } @@ -1394,7 +1681,6 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; - void *bi_private; z_erofs_next_pcluster_t owned_head = f->owned_head; /* bio is NULL initially, so no need to initialize last_{index,bdev} */ pgoff_t last_index; @@ -1404,7 +1690,13 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, unsigned long pflags; int memstall = 0; - bi_private = jobqueueset_init(sb, q, fgq, force_fg); + /* + * if managed cache is enabled, bypass jobqueue is needed, + * no need to read from device for all pclusters in this queue. + */ + q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL); + q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg); + qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head; @@ -1473,7 +1765,7 @@ submit_bio_retry: last_bdev = mdev.m_bdev; bio->bi_iter.bi_sector = (sector_t)cur << LOG_SECTORS_PER_BLOCK; - bio->bi_private = bi_private; + bio->bi_private = q[JQ_SUBMIT]; if (f->readahead) bio->bi_opf |= REQ_RAHEAD; ++nr_bios; @@ -1500,13 +1792,13 @@ submit_bio_retry: /* * although background is preferred, no one is pending for submission. - * don't issue workqueue for decompression but drop it directly instead. + * don't issue decompression but drop it directly instead. */ if (!*force_fg && !nr_bios) { kvfree(q[JQ_SUBMIT]); return; } - z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios); + z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios); } static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h deleted file mode 100644 index d98c95212985..000000000000 --- a/fs/erofs/zdata.h +++ /dev/null @@ -1,178 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2018 HUAWEI, Inc. 
- * https://www.huawei.com/ - */ -#ifndef __EROFS_FS_ZDATA_H -#define __EROFS_FS_ZDATA_H - -#include "internal.h" -#include "tagptr.h" - -#define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) -#define Z_EROFS_INLINE_BVECS 2 - -/* - * let's leave a type here in case of introducing - * another tagged pointer later. - */ -typedef void *z_erofs_next_pcluster_t; - -struct z_erofs_bvec { - struct page *page; - int offset; - unsigned int end; -}; - -#define __Z_EROFS_BVSET(name, total) \ -struct name { \ - /* point to the next page which contains the following bvecs */ \ - struct page *nextpage; \ - struct z_erofs_bvec bvec[total]; \ -} -__Z_EROFS_BVSET(z_erofs_bvset,); -__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS); - -/* - * Structure fields follow one of the following exclusion rules. - * - * I: Modifiable by initialization/destruction paths and read-only - * for everyone else; - * - * L: Field should be protected by the pcluster lock; - * - * A: Field should be accessed / updated in atomic for parallelized code. - */ -struct z_erofs_pcluster { - struct erofs_workgroup obj; - struct mutex lock; - - /* A: point to next chained pcluster or TAILs */ - z_erofs_next_pcluster_t next; - - /* L: the maximum decompression size of this round */ - unsigned int length; - - /* L: total number of bvecs */ - unsigned int vcnt; - - /* I: page offset of start position of decompression */ - unsigned short pageofs_out; - - /* I: page offset of inline compressed data */ - unsigned short pageofs_in; - - union { - /* L: inline a certain number of bvec for bootstrap */ - struct z_erofs_bvset_inline bvset; - - /* I: can be used to free the pcluster by RCU. */ - struct rcu_head rcu; - }; - - union { - /* I: physical cluster size in pages */ - unsigned short pclusterpages; - - /* I: tailpacking inline compressed size */ - unsigned short tailpacking_size; - }; - - /* I: compression algorithm format */ - unsigned char algorithmformat; - - /* L: whether partial decompression or not */ - bool partial; - - /* L: indicate several pageofs_outs or not */ - bool multibases; - - /* A: compressed bvecs (can be cached or inplaced pages) */ - struct z_erofs_bvec compressed_bvecs[]; -}; - -/* let's avoid the valid 32-bit kernel addresses */ - -/* the chained workgroup has't submitted io (still open) */ -#define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE) -/* the chained workgroup has already submitted io */ -#define Z_EROFS_PCLUSTER_TAIL_CLOSED ((void *)0x5F0EDEAD) - -#define Z_EROFS_PCLUSTER_NIL (NULL) - -struct z_erofs_decompressqueue { - struct super_block *sb; - atomic_t pending_bios; - z_erofs_next_pcluster_t head; - - union { - struct completion done; - struct work_struct work; - } u; - - bool eio; -}; - -static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) -{ - return !pcl->obj.index; -} - -static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) -{ - if (z_erofs_is_inline_pcluster(pcl)) - return 1; - return pcl->pclusterpages; -} - -/* - * bit 30: I/O error occurred on this page - * bit 0 - 29: remaining parts to complete this page - */ -#define Z_EROFS_PAGE_EIO (1 << 30) - -static inline void z_erofs_onlinepage_init(struct page *page) -{ - union { - atomic_t o; - unsigned long v; - } u = { .o = ATOMIC_INIT(1) }; - - set_page_private(page, u.v); - smp_wmb(); - SetPagePrivate(page); -} - -static inline void z_erofs_onlinepage_split(struct page *page) -{ - atomic_inc((atomic_t *)&page->private); -} - -static inline void z_erofs_page_mark_eio(struct 
page *page) -{ - int orig; - - do { - orig = atomic_read((atomic_t *)&page->private); - } while (atomic_cmpxchg((atomic_t *)&page->private, orig, - orig | Z_EROFS_PAGE_EIO) != orig); -} - -static inline void z_erofs_onlinepage_endio(struct page *page) -{ - unsigned int v; - - DBG_BUGON(!PagePrivate(page)); - v = atomic_dec_return((atomic_t *)&page->private); - if (!(v & ~Z_EROFS_PAGE_EIO)) { - set_page_private(page, 0); - ClearPagePrivate(page); - if (!(v & Z_EROFS_PAGE_EIO)) - SetPageUptodate(page); - unlock_page(page); - } -} - -#define Z_EROFS_ONSTACK_PAGES 32 - -#endif diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 98fb90b9af71..8bf6d30518b6 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -7,10 +7,6 @@ #include <asm/unaligned.h> #include <trace/events/erofs.h> -static int z_erofs_do_map_blocks(struct inode *inode, - struct erofs_map_blocks *map, - int flags); - int z_erofs_fill_inode(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); @@ -29,126 +25,6 @@ int z_erofs_fill_inode(struct inode *inode) return 0; } -static int z_erofs_fill_inode_lazy(struct inode *inode) -{ - struct erofs_inode *const vi = EROFS_I(inode); - struct super_block *const sb = inode->i_sb; - int err, headnr; - erofs_off_t pos; - struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - void *kaddr; - struct z_erofs_map_header *h; - - if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) { - /* - * paired with smp_mb() at the end of the function to ensure - * fields will only be observed after the bit is set. - */ - smp_mb(); - return 0; - } - - if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE)) - return -ERESTARTSYS; - - err = 0; - if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) - goto out_unlock; - - pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + - vi->xattr_isize, 8); - kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP); - if (IS_ERR(kaddr)) { - err = PTR_ERR(kaddr); - goto out_unlock; - } - - h = kaddr + erofs_blkoff(pos); - /* - * if the highest bit of the 8-byte map header is set, the whole file - * is stored in the packed inode. The rest bits keeps z_fragmentoff. 
- */ - if (h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT) { - vi->z_advise = Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; - vi->z_fragmentoff = le64_to_cpu(*(__le64 *)h) ^ (1ULL << 63); - vi->z_tailextent_headlcn = 0; - goto done; - } - vi->z_advise = le16_to_cpu(h->h_advise); - vi->z_algorithmtype[0] = h->h_algorithmtype & 15; - vi->z_algorithmtype[1] = h->h_algorithmtype >> 4; - - headnr = 0; - if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX || - vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) { - erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel", - headnr + 1, vi->z_algorithmtype[headnr], vi->nid); - err = -EOPNOTSUPP; - goto out_put_metabuf; - } - - vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7); - if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && - vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | - Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { - erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu", - vi->nid); - err = -EFSCORRUPTED; - goto out_put_metabuf; - } - if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION && - !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^ - !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { - erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu", - vi->nid); - err = -EFSCORRUPTED; - goto out_put_metabuf; - } - - if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) { - struct erofs_map_blocks map = { - .buf = __EROFS_BUF_INITIALIZER - }; - - vi->z_idata_size = le16_to_cpu(h->h_idata_size); - err = z_erofs_do_map_blocks(inode, &map, - EROFS_GET_BLOCKS_FINDTAIL); - erofs_put_metabuf(&map.buf); - - if (!map.m_plen || - erofs_blkoff(map.m_pa) + map.m_plen > EROFS_BLKSIZ) { - erofs_err(sb, "invalid tail-packing pclustersize %llu", - map.m_plen); - err = -EFSCORRUPTED; - } - if (err < 0) - goto out_put_metabuf; - } - - if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER && - !(h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT)) { - struct erofs_map_blocks map = { - .buf = __EROFS_BUF_INITIALIZER - }; - - vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff); - err = z_erofs_do_map_blocks(inode, &map, - EROFS_GET_BLOCKS_FINDTAIL); - erofs_put_metabuf(&map.buf); - if (err < 0) - goto out_put_metabuf; - } -done: - /* paired with smp_mb() at the beginning of the function */ - smp_mb(); - set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); -out_put_metabuf: - erofs_put_metabuf(&buf); -out_unlock: - clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags); - return err; -} - struct z_erofs_maprecorder { struct inode *inode; struct erofs_map_blocks *map; @@ -169,10 +45,9 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, { struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); - const erofs_off_t ibase = iloc(EROFS_I_SB(inode), vi->nid); const erofs_off_t pos = - Z_EROFS_VLE_LEGACY_INDEX_ALIGN(ibase + vi->inode_isize + - vi->xattr_isize) + + Z_EROFS_VLE_LEGACY_INDEX_ALIGN(erofs_iloc(inode) + + vi->inode_isize + vi->xattr_isize) + lcn * sizeof(struct z_erofs_vle_decompressed_index); struct z_erofs_vle_decompressed_index *di; unsigned int advise, type; @@ -372,9 +247,8 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); const unsigned int lclusterbits = vi->z_logical_clusterbits; - const erofs_off_t ebase = ALIGN(iloc(EROFS_I_SB(inode), vi->nid) + - vi->inode_isize + vi->xattr_isize, 8) + - sizeof(struct z_erofs_map_header); + const 
erofs_off_t ebase = sizeof(struct z_erofs_map_header) + + ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); const unsigned int totalidx = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ); unsigned int compacted_4b_initial, compacted_2b; unsigned int amortizedshift; @@ -732,6 +606,125 @@ unmap_out: return err; } +static int z_erofs_fill_inode_lazy(struct inode *inode) +{ + struct erofs_inode *const vi = EROFS_I(inode); + struct super_block *const sb = inode->i_sb; + int err, headnr; + erofs_off_t pos; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + void *kaddr; + struct z_erofs_map_header *h; + + if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) { + /* + * paired with smp_mb() at the end of the function to ensure + * fields will only be observed after the bit is set. + */ + smp_mb(); + return 0; + } + + if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE)) + return -ERESTARTSYS; + + err = 0; + if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) + goto out_unlock; + + pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); + kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP); + if (IS_ERR(kaddr)) { + err = PTR_ERR(kaddr); + goto out_unlock; + } + + h = kaddr + erofs_blkoff(pos); + /* + * if the highest bit of the 8-byte map header is set, the whole file + * is stored in the packed inode. The rest bits keeps z_fragmentoff. + */ + if (h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT) { + vi->z_advise = Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; + vi->z_fragmentoff = le64_to_cpu(*(__le64 *)h) ^ (1ULL << 63); + vi->z_tailextent_headlcn = 0; + goto done; + } + vi->z_advise = le16_to_cpu(h->h_advise); + vi->z_algorithmtype[0] = h->h_algorithmtype & 15; + vi->z_algorithmtype[1] = h->h_algorithmtype >> 4; + + headnr = 0; + if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX || + vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) { + erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel", + headnr + 1, vi->z_algorithmtype[headnr], vi->nid); + err = -EOPNOTSUPP; + goto out_put_metabuf; + } + + vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7); + if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && + vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | + Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { + erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu", + vi->nid); + err = -EFSCORRUPTED; + goto out_put_metabuf; + } + if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION && + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^ + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { + erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu", + vi->nid); + err = -EFSCORRUPTED; + goto out_put_metabuf; + } + + if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) { + struct erofs_map_blocks map = { + .buf = __EROFS_BUF_INITIALIZER + }; + + vi->z_idata_size = le16_to_cpu(h->h_idata_size); + err = z_erofs_do_map_blocks(inode, &map, + EROFS_GET_BLOCKS_FINDTAIL); + erofs_put_metabuf(&map.buf); + + if (!map.m_plen || + erofs_blkoff(map.m_pa) + map.m_plen > EROFS_BLKSIZ) { + erofs_err(sb, "invalid tail-packing pclustersize %llu", + map.m_plen); + err = -EFSCORRUPTED; + } + if (err < 0) + goto out_put_metabuf; + } + + if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER && + !(h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT)) { + struct erofs_map_blocks map = { + .buf = __EROFS_BUF_INITIALIZER + }; + + vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff); + err = z_erofs_do_map_blocks(inode, &map, + 
EROFS_GET_BLOCKS_FINDTAIL); + erofs_put_metabuf(&map.buf); + if (err < 0) + goto out_put_metabuf; + } +done: + /* paired with smp_mb() at the beginning of the function */ + smp_mb(); + set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); +out_put_metabuf: + erofs_put_metabuf(&buf); +out_unlock: + clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags); + return err; +} + int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int flags) { diff --git a/fs/exec.c b/fs/exec.c index ab913243a367..7c44d0c65b1b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -270,7 +270,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; - vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; + vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); err = insert_vm_struct(mm, vma); @@ -699,7 +699,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) /* * cover the whole range: [new_start, old_end) */ - if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL)) + if (vma_expand(&vmi, vma, new_start, old_end, vma->vm_pgoff, NULL)) return -ENOMEM; /* @@ -731,12 +731,9 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) } tlb_finish_mmu(&tlb); - /* - * Shrink the vma to just the new range. Always succeeds. - */ - vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL); - - return 0; + vma_prev(&vmi); + /* Shrink the vma to just the new range */ + return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); } /* @@ -758,6 +755,7 @@ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_expand; unsigned long rlim_stack; struct mmu_gather tlb; + struct vma_iterator vmi; #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ @@ -812,8 +810,10 @@ int setup_arg_pages(struct linux_binprm *bprm, vm_flags |= mm->def_flags; vm_flags |= VM_STACK_INCOMPLETE_SETUP; + vma_iter_init(&vmi, mm, vma->vm_start); + tlb_gather_mmu(&tlb, mm); - ret = mprotect_fixup(&tlb, vma, &prev, vma->vm_start, vma->vm_end, + ret = mprotect_fixup(&vmi, &tlb, vma, &prev, vma->vm_start, vma->vm_end, vm_flags); tlb_finish_mmu(&tlb); @@ -834,7 +834,7 @@ int setup_arg_pages(struct linux_binprm *bprm, } /* mprotect_fixup is overkill to remove the temporary stack flags */ - vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP; + vm_flags_clear(vma, VM_STACK_INCOMPLETE_SETUP); stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ stack_size = vma->vm_end - vma->vm_start; @@ -1010,6 +1010,7 @@ static int exec_mmap(struct mm_struct *mm) active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; + mm_init_cid(mm); /* * This prevents preemption while active_mm is being loaded and * it and mm are being updated, which could cause problems for @@ -1414,15 +1415,15 @@ EXPORT_SYMBOL(begin_new_exec); void would_dump(struct linux_binprm *bprm, struct file *file) { struct inode *inode = file_inode(file); - struct user_namespace *mnt_userns = file_mnt_user_ns(file); - if (inode_permission(mnt_userns, inode, MAY_READ) < 0) { + struct mnt_idmap *idmap = file_mnt_idmap(file); + if (inode_permission(idmap, inode, MAY_READ) < 0) { struct user_namespace *old, *user_ns; bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; /* Ensure mm->user_ns contains the executable */ user_ns = old = bprm->mm->user_ns; while ((user_ns != &init_user_ns) && - 
!privileged_wrt_inode_uidgid(user_ns, mnt_userns, inode)) + !privileged_wrt_inode_uidgid(user_ns, idmap, inode)) user_ns = user_ns->parent; if (old != user_ns) { @@ -1596,7 +1597,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm) static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) { /* Handle suid and sgid on files */ - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; struct inode *inode = file_inode(file); unsigned int mode; vfsuid_t vfsuid; @@ -1612,15 +1613,15 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) if (!(mode & (S_ISUID|S_ISGID))) return; - mnt_userns = file_mnt_user_ns(file); + idmap = file_mnt_idmap(file); /* Be careful if suid/sgid is set */ inode_lock(inode); /* reload atomically mode/uid/gid now that lock held */ mode = inode->i_mode; - vfsuid = i_uid_into_vfsuid(mnt_userns, inode); - vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsuid = i_uid_into_vfsuid(idmap, inode); + vfsgid = i_gid_into_vfsgid(idmap, inode); inode_unlock(inode); /* We ignore suid/sgid if there are no mappings for them in the ns */ @@ -1822,6 +1823,7 @@ static int bprm_execve(struct linux_binprm *bprm, */ check_unsafe_exec(bprm); current->in_execve = 1; + sched_mm_cid_before_execve(current); file = do_open_execat(fd, filename, flags); retval = PTR_ERR(file); @@ -1852,6 +1854,7 @@ static int bprm_execve(struct linux_binprm *bprm, if (retval < 0) goto out; + sched_mm_cid_after_execve(current); /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; @@ -1871,6 +1874,7 @@ out: force_fatal_sig(SIGSEGV); out_unmark: + sched_mm_cid_after_execve(current); current->fs->in_exec = 0; current->in_execve = 0; diff --git a/fs/exfat/Kconfig b/fs/exfat/Kconfig index 5a65071b5ecf..147edeb04469 100644 --- a/fs/exfat/Kconfig +++ b/fs/exfat/Kconfig @@ -3,6 +3,7 @@ config EXFAT_FS tristate "exFAT filesystem support" select NLS + select LEGACY_DIRECT_IO help This allows you to mount devices formatted with the exFAT file system. exFAT is typically used on SD-Cards or USB sticks. 
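The fs/exec.c hunks above stop writing vma->vm_flags directly and go through vm_flags_init()/vm_flags_clear() instead. A minimal sketch of the rationale, assuming the distinction the real accessors draw between a vma that is not yet published (no locking needed) and one that is (modification requires the mmap write lock); the assert() is a stand-in for mmap_assert_write_locked():

#include <assert.h>
#include <stdio.h>

typedef unsigned long vm_flags_t;

struct vma {
        vm_flags_t flags;
        int mmap_write_locked;     /* stand-in for the mm's lock state */
};

/* Initialization: the vma is not reachable yet, so no lock is required. */
static void vm_flags_init(struct vma *vma, vm_flags_t flags)
{
        vma->flags = flags;
}

/* Later modifications must hold the mmap write lock. */
static void vm_flags_clear(struct vma *vma, vm_flags_t flags)
{
        assert(vma->mmap_write_locked);
        vma->flags &= ~flags;
}

int main(void)
{
        struct vma stack = { 0 };

        vm_flags_init(&stack, 0x100 | 0x200);  /* setup-time flags */
        stack.mmap_write_locked = 1;
        vm_flags_clear(&stack, 0x200);         /* drop the temporary flag */
        printf("flags: %#lx\n", stack.flags);
        return 0;
}

Funneling every update through accessors gives one place to assert the locking rules, which direct |= and &= writes scattered across callers cannot.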
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index 1dfa67f307f1..957574180a5e 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -29,14 +29,15 @@ static int exfat_extract_uni_name(struct exfat_dentry *ep, } -static void exfat_get_uniname_from_ext_entry(struct super_block *sb, +static int exfat_get_uniname_from_ext_entry(struct super_block *sb, struct exfat_chain *p_dir, int entry, unsigned short *uniname) { - int i; + int i, err; struct exfat_entry_set_cache es; - if (exfat_get_dentry_set(&es, sb, p_dir, entry, ES_ALL_ENTRIES)) - return; + err = exfat_get_dentry_set(&es, sb, p_dir, entry, ES_ALL_ENTRIES); + if (err) + return err; /* * First entry : file entry @@ -56,12 +57,13 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb, } exfat_put_dentry_set(&es, false); + return 0; } /* read a directory entry from the opened directory */ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_entry *dir_entry) { - int i, dentries_per_clu, num_ext; + int i, dentries_per_clu, num_ext, err; unsigned int type, clu_offset, max_dentries; struct exfat_chain dir, clu; struct exfat_uni_name uni_name; @@ -100,7 +102,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent clu.dir = ei->hint_bmap.clu; } - while (clu_offset > 0) { + while (clu_offset > 0 && clu.dir != EXFAT_EOF_CLUSTER) { if (exfat_get_next_cluster(sb, &(clu.dir))) return -EIO; @@ -146,8 +148,12 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent 0); *uni_name.name = 0x0; - exfat_get_uniname_from_ext_entry(sb, &clu, i, + err = exfat_get_uniname_from_ext_entry(sb, &clu, i, uni_name.name); + if (err) { + brelse(bh); + continue; + } exfat_utf16_to_nls(sb, &uni_name, dir_entry->namebuf.lfn, dir_entry->namebuf.lfnbuf_len); @@ -234,10 +240,7 @@ static int exfat_iterate(struct file *file, struct dir_context *ctx) fake_offset = 1; } - if (cpos & (DENTRY_SIZE - 1)) { - err = -ENOENT; - goto unlock; - } + cpos = round_up(cpos, DENTRY_SIZE); /* name buffer should be allocated before use */ err = exfat_alloc_namebuf(nb); @@ -378,6 +381,12 @@ unsigned int exfat_get_entry_type(struct exfat_dentry *ep) return TYPE_ACL; return TYPE_CRITICAL_SEC; } + + if (ep->type == EXFAT_VENDOR_EXT) + return TYPE_VENDOR_EXT; + if (ep->type == EXFAT_VENDOR_ALLOC) + return TYPE_VENDOR_ALLOC; + return TYPE_BENIGN_SEC; } @@ -521,6 +530,25 @@ release_fbh: return ret; } +static void exfat_free_benign_secondary_clusters(struct inode *inode, + struct exfat_dentry *ep) +{ + struct super_block *sb = inode->i_sb; + struct exfat_chain dir; + unsigned int start_clu = + le32_to_cpu(ep->dentry.generic_secondary.start_clu); + u64 size = le64_to_cpu(ep->dentry.generic_secondary.size); + unsigned char flags = ep->dentry.generic_secondary.flags; + + if (!(flags & ALLOC_POSSIBLE) || !start_clu || !size) + return; + + exfat_chain_set(&dir, start_clu, + EXFAT_B_TO_CLU_ROUND_UP(size, EXFAT_SB(sb)), + flags); + exfat_free_cluster(inode, &dir); +} + int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir, int entry, int num_entries, struct exfat_uni_name *p_uniname) { @@ -553,6 +581,9 @@ int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir, if (!ep) return -EIO; + if (exfat_get_entry_type(ep) & TYPE_BENIGN_SEC) + exfat_free_benign_secondary_clusters(inode, ep); + exfat_init_name_entry(ep, uniname); exfat_update_bh(bh, sync); brelse(bh); @@ -576,6 +607,9 @@ int exfat_remove_entries(struct inode *inode, struct exfat_chain *p_dir, if (!ep) return -EIO; + if 
(exfat_get_entry_type(ep) & TYPE_BENIGN_SEC) + exfat_free_benign_secondary_clusters(inode, ep); + exfat_set_entry_type(ep, TYPE_DELETED); exfat_update_bh(bh, IS_DIRSYNC(inode)); brelse(bh); @@ -744,6 +778,7 @@ enum exfat_validate_dentry_mode { ES_MODE_GET_STRM_ENTRY, ES_MODE_GET_NAME_ENTRY, ES_MODE_GET_CRITICAL_SEC_ENTRY, + ES_MODE_GET_BENIGN_SEC_ENTRY, }; static bool exfat_validate_entry(unsigned int type, @@ -757,36 +792,33 @@ static bool exfat_validate_entry(unsigned int type, if (type != TYPE_FILE && type != TYPE_DIR) return false; *mode = ES_MODE_GET_FILE_ENTRY; - return true; + break; case ES_MODE_GET_FILE_ENTRY: if (type != TYPE_STREAM) return false; *mode = ES_MODE_GET_STRM_ENTRY; - return true; + break; case ES_MODE_GET_STRM_ENTRY: if (type != TYPE_EXTEND) return false; *mode = ES_MODE_GET_NAME_ENTRY; - return true; + break; case ES_MODE_GET_NAME_ENTRY: - if (type == TYPE_STREAM) - return false; - if (type != TYPE_EXTEND) { - if (!(type & TYPE_CRITICAL_SEC)) - return false; - *mode = ES_MODE_GET_CRITICAL_SEC_ENTRY; - } - return true; - case ES_MODE_GET_CRITICAL_SEC_ENTRY: - if (type == TYPE_EXTEND || type == TYPE_STREAM) + if (type & TYPE_BENIGN_SEC) + *mode = ES_MODE_GET_BENIGN_SEC_ENTRY; + else if (type != TYPE_EXTEND) return false; - if ((type & TYPE_CRITICAL_SEC) != TYPE_CRITICAL_SEC) + break; + case ES_MODE_GET_BENIGN_SEC_ENTRY: + /* Assume unreconized benign secondary entry */ + if (!(type & TYPE_BENIGN_SEC)) return false; - return true; + break; default: - WARN_ON_ONCE(1); return false; } + + return true; } struct exfat_dentry *exfat_get_dentry_cached( @@ -1167,10 +1199,8 @@ int exfat_count_ext_entries(struct super_block *sb, struct exfat_chain *p_dir, type = exfat_get_entry_type(ext_ep); brelse(bh); - if (type == TYPE_EXTEND || type == TYPE_STREAM) + if (type & TYPE_CRITICAL_SEC || type & TYPE_BENIGN_SEC) count++; - else - break; } return count; } diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index bc6d21d7c5ad..729ada9e26e8 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -50,7 +50,7 @@ enum { #define ES_IDX_LAST_FILENAME(name_len) \ (ES_IDX_FIRST_FILENAME + EXFAT_FILENAME_ENTRY_NUM(name_len) - 1) -#define DIR_DELETED 0xFFFF0321 +#define DIR_DELETED 0xFFFFFFF7 /* type values */ #define TYPE_UNUSED 0x0000 @@ -71,6 +71,8 @@ enum { #define TYPE_PADDING 0x0402 #define TYPE_ACLTAB 0x0403 #define TYPE_BENIGN_SEC 0x0800 +#define TYPE_VENDOR_EXT 0x0801 +#define TYPE_VENDOR_ALLOC 0x0802 #define MAX_CHARSET_SIZE 6 /* max size of multi-byte character */ #define MAX_NAME_LENGTH 255 /* max len of file name excluding NULL */ @@ -450,9 +452,9 @@ int exfat_trim_fs(struct inode *inode, struct fstrim_range *range); extern const struct file_operations exfat_file_operations; int __exfat_truncate(struct inode *inode); void exfat_truncate(struct inode *inode); -int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int exfat_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); -int exfat_getattr(struct user_namespace *mnt_userns, const struct path *path, +int exfat_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, unsigned int request_mask, unsigned int query_flags); int exfat_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); diff --git a/fs/exfat/exfat_raw.h b/fs/exfat/exfat_raw.h index 7f39b1c6469c..0ece2e43cf49 100644 --- a/fs/exfat/exfat_raw.h +++ b/fs/exfat/exfat_raw.h @@ -27,6 +27,7 @@ ((sbi)->num_clusters - EXFAT_RESERVED_CLUSTERS) /* AllocationPossible and NoFatChain 
field in GeneralSecondaryFlags Field */ +#define ALLOC_POSSIBLE 0x01 #define ALLOC_FAT_CHAIN 0x01 #define ALLOC_NO_FAT_CHAIN 0x03 @@ -50,6 +51,8 @@ #define EXFAT_STREAM 0xC0 /* stream entry */ #define EXFAT_NAME 0xC1 /* file name entry */ #define EXFAT_ACL 0xC2 /* stream entry */ +#define EXFAT_VENDOR_EXT 0xE0 /* vendor extension entry */ +#define EXFAT_VENDOR_ALLOC 0xE1 /* vendor allocation entry */ #define IS_EXFAT_CRITICAL_PRI(x) (x < 0xA0) #define IS_EXFAT_BENIGN_PRI(x) (x < 0xC0) @@ -155,6 +158,24 @@ struct exfat_dentry { __le32 start_clu; __le64 size; } __packed upcase; /* up-case table directory entry */ + struct { + __u8 flags; + __u8 vendor_guid[16]; + __u8 vendor_defined[14]; + } __packed vendor_ext; /* vendor extension directory entry */ + struct { + __u8 flags; + __u8 vendor_guid[16]; + __u8 vendor_defined[2]; + __le32 start_clu; + __le64 size; + } __packed vendor_alloc; /* vendor allocation directory entry */ + struct { + __u8 flags; + __u8 custom_defined[18]; + __le32 start_clu; + __le64 size; + } __packed generic_secondary; /* generic secondary directory entry */ } __packed dentry; } __packed; diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index 41ae4cce1f42..56b870d9cc0d 100644 --- a/fs/exfat/fatent.c +++ b/fs/exfat/fatent.c @@ -307,7 +307,7 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, struct exfat_chain *p_chain, bool sync_bmap) { int ret = -ENOSPC; - unsigned int num_clusters = 0, total_cnt; + unsigned int total_cnt; unsigned int hint_clu, new_clu, last_clu = EXFAT_EOF_CLUSTER; struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); @@ -344,17 +344,11 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, /* check cluster validation */ if (!is_valid_cluster(sbi, hint_clu)) { - exfat_err(sb, "hint_cluster is invalid (%u)", - hint_clu); + if (hint_clu != sbi->num_clusters) + exfat_err(sb, "hint_cluster is invalid (%u), rewind to the first cluster", + hint_clu); hint_clu = EXFAT_FIRST_CLUSTER; - if (p_chain->flags == ALLOC_NO_FAT_CHAIN) { - if (exfat_chain_cont_cluster(sb, p_chain->dir, - num_clusters)) { - ret = -EIO; - goto unlock; - } - p_chain->flags = ALLOC_FAT_CHAIN; - } + p_chain->flags = ALLOC_FAT_CHAIN; } p_chain->dir = EXFAT_EOF_CLUSTER; @@ -364,7 +358,7 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, if (new_clu != hint_clu && p_chain->flags == ALLOC_NO_FAT_CHAIN) { if (exfat_chain_cont_cluster(sb, p_chain->dir, - num_clusters)) { + p_chain->size)) { ret = -EIO; goto free_cluster; } @@ -377,8 +371,6 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, goto free_cluster; } - num_clusters++; - /* update FAT table */ if (p_chain->flags == ALLOC_FAT_CHAIN) { if (exfat_ent_set(sb, new_clu, EXFAT_EOF_CLUSTER)) { @@ -395,13 +387,14 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, goto free_cluster; } } + p_chain->size++; + last_clu = new_clu; - if (--num_alloc == 0) { + if (p_chain->size == num_alloc) { sbi->clu_srch_ptr = hint_clu; - sbi->used_clusters += num_clusters; + sbi->used_clusters += num_alloc; - p_chain->size += num_clusters; mutex_unlock(&sbi->bitmap_lock); return 0; } @@ -412,7 +405,7 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, if (p_chain->flags == ALLOC_NO_FAT_CHAIN) { if (exfat_chain_cont_cluster(sb, p_chain->dir, - num_clusters)) { + p_chain->size)) { ret = -EIO; goto free_cluster; } @@ -421,8 +414,7 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, } } free_cluster: - 
if (num_clusters) - __exfat_free_cluster(inode, p_chain); + __exfat_free_cluster(inode, p_chain); unlock: mutex_unlock(&sbi->bitmap_lock); return ret; diff --git a/fs/exfat/file.c b/fs/exfat/file.c index f5b29072775d..e99183a74611 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -209,8 +209,7 @@ void exfat_truncate(struct inode *inode) if (err) goto write_size; - inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> - inode->i_blkbits; + inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> 9; write_size: aligned_size = i_size_read(inode); if (aligned_size & (blocksize - 1)) { @@ -226,14 +225,14 @@ write_size: mutex_unlock(&sbi->s_lock); } -int exfat_getattr(struct user_namespace *mnt_uerns, const struct path *path, +int exfat_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, unsigned int request_mask, unsigned int query_flags) { struct inode *inode = d_backing_inode(path->dentry); struct exfat_inode_info *ei = EXFAT_I(inode); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); exfat_truncate_atime(&stat->atime); stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = ei->i_crtime.tv_sec; @@ -242,7 +241,7 @@ int exfat_getattr(struct user_namespace *mnt_uerns, const struct path *path, return 0; } -int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int exfat_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct exfat_sb_info *sbi = EXFAT_SB(dentry->d_sb); @@ -266,7 +265,7 @@ int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, ATTR_TIMES_SET); } - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); attr->ia_valid = ia_valid; if (error) goto out; @@ -293,7 +292,7 @@ int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (attr->ia_valid & ATTR_SIZE) inode->i_mtime = inode->i_ctime = current_time(inode); - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); exfat_truncate_atime(&inode->i_atime); if (attr->ia_valid & ATTR_SIZE) { diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 5b644cb057fa..481dd338f2b8 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -220,8 +220,7 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset, num_clusters += num_to_be_allocated; *clu = new_clu.dir; - inode->i_blocks += - num_to_be_allocated << sbi->sect_per_clus_bits; + inode->i_blocks += EXFAT_CLU_TO_B(num_to_be_allocated, sbi) >> 9; /* * Move *clu pointer along FAT chains (hole care) because the @@ -576,8 +575,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info) exfat_save_attr(inode, info->attr); - inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> - inode->i_blkbits; + inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> 9; inode->i_mtime = info->mtime; inode->i_ctime = info->mtime; ei->i_crtime = info->crtime; diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 5f995eba5dbb..e0ff9d156f6f 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -396,7 +396,7 @@ static int exfat_find_empty_entry(struct inode *inode, ei->i_size_ondisk += sbi->cluster_size; ei->i_size_aligned += sbi->cluster_size; ei->flags = p_dir->flags; - inode->i_blocks += 1 << sbi->sect_per_clus_bits; + inode->i_blocks += sbi->cluster_size >> 9; } return dentry; @@ -551,7 +551,7 @@ out: return ret; } -static int exfat_create(struct 
user_namespace *mnt_userns, struct inode *dir, +static int exfat_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct super_block *sb = dir->i_sb; @@ -834,7 +834,7 @@ unlock: return err; } -static int exfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct super_block *sb = dir->i_sb; @@ -1285,7 +1285,7 @@ out: return ret; } -static int exfat_rename(struct user_namespace *mnt_userns, +static int exfat_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 35f0305cd493..8c32460e031e 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -373,8 +373,7 @@ static int exfat_read_root(struct inode *inode) inode->i_op = &exfat_dir_inode_operations; inode->i_fop = &exfat_dir_operations; - inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> - inode->i_blkbits; + inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> 9; ei->i_pos = ((loff_t)sbi->root_dir << 32) | 0xffffffff; ei->i_size_aligned = i_size_read(inode); ei->i_size_ondisk = i_size_read(inode); diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 3204bd33e4e8..ab88d33d106c 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -145,7 +145,7 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, if (err) goto out_err; dprintk("%s: found name: %s\n", __func__, nbuf); - tmp = lookup_one_unlocked(mnt_user_ns(mnt), nbuf, parent, strlen(nbuf)); + tmp = lookup_one_unlocked(mnt_idmap(mnt), nbuf, parent, strlen(nbuf)); if (IS_ERR(tmp)) { dprintk("lookup failed: %ld\n", PTR_ERR(tmp)); err = PTR_ERR(tmp); @@ -524,7 +524,7 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len, } inode_lock(target_dir->d_inode); - nresult = lookup_one(mnt_user_ns(mnt), nbuf, + nresult = lookup_one(mnt_idmap(mnt), nbuf, target_dir, strlen(nbuf)); if (!IS_ERR(nresult)) { if (unlikely(nresult->d_inode != result->d_inode)) { diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig index 1248ff4ef562..77393fda99af 100644 --- a/fs/ext2/Kconfig +++ b/fs/ext2/Kconfig @@ -2,6 +2,7 @@ config EXT2_FS tristate "Second extended fs support" select FS_IOMAP + select LEGACY_DIRECT_IO help Ext2 is a standard Linux file system for hard disks. 
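The exfat hunks above replace shifts by inode->i_blkbits with a hard-coded >> 9 when filling inode->i_blocks, because i_blocks counts 512-byte sectors regardless of the volume's cluster or block size. A small sketch of the arithmetic (the helper name and values are illustrative):

#include <stdio.h>

/* Round the file size up to a whole cluster, then count 512-byte sectors. */
static unsigned long long exfat_i_blocks(unsigned long long i_size,
                                         unsigned int cluster_size)
{
        unsigned long long rounded =
                (i_size + cluster_size - 1) / cluster_size * cluster_size;

        return rounded >> 9;
}

int main(void)
{
        /* A 3000-byte file on a 4 KiB-cluster volume occupies one cluster. */
        printf("%llu sectors\n", exfat_i_blocks(3000, 4096)); /* 8 sectors */
        return 0;
}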
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 440d5f1e9d47..82b17d7fc93f 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -219,7 +219,7 @@ __ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) * inode->i_mutex: down */ int -ext2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +ext2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int error; @@ -228,7 +228,7 @@ ext2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, umode_t mode = inode->i_mode; if (type == ACL_TYPE_ACCESS && acl) { - error = posix_acl_update_mode(&init_user_ns, inode, &mode, + error = posix_acl_update_mode(&nop_mnt_idmap, inode, &mode, &acl); if (error) return error; diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index 3841becb94ff..4a8443a2b8ec 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -56,7 +56,7 @@ static inline int ext2_acl_count(size_t size) /* acl.c */ extern struct posix_acl *ext2_get_acl(struct inode *inode, int type, bool rcu); -extern int ext2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +extern int ext2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); extern int ext2_init_acl (struct inode *, struct inode *); diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index e5cbc27ba459..4a6955a0a116 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -461,9 +461,9 @@ static int ext2_handle_dirsync(struct inode *dir) return err; } -void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, - struct page *page, void *page_addr, struct inode *inode, - int update_times) +int ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, + struct page *page, void *page_addr, struct inode *inode, + bool update_times) { loff_t pos = page_offset(page) + (char *) de - (char *) page_addr; @@ -472,7 +472,10 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, lock_page(page); err = ext2_prepare_chunk(page, pos, len); - BUG_ON(err); + if (err) { + unlock_page(page); + return err; + } de->inode = cpu_to_le32(inode->i_ino); ext2_set_de_type(de, inode); ext2_commit_chunk(page, pos, len); @@ -480,7 +483,7 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, dir->i_mtime = dir->i_ctime = current_time(dir); EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(dir); - ext2_handle_dirsync(dir); + return ext2_handle_dirsync(dir); } /* @@ -646,7 +649,7 @@ int ext2_make_empty(struct inode *inode, struct inode *parent) unlock_page(page); goto fail; } - kaddr = kmap_atomic(page); + kaddr = kmap_local_page(page); memset(kaddr, 0, chunk_size); de = (struct ext2_dir_entry_2 *)kaddr; de->name_len = 1; @@ -661,7 +664,7 @@ int ext2_make_empty(struct inode *inode, struct inode *parent) de->inode = cpu_to_le32(parent->i_ino); memcpy (de->name, "..\0", 4); ext2_set_de_type (de, inode); - kunmap_atomic(kaddr); + kunmap_local(kaddr); ext2_commit_chunk(page, 0, chunk_size); err = ext2_handle_dirsync(inode); fail: diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 28de11a22e5f..cb78d7dcfb95 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -734,8 +734,9 @@ extern int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct page *page, char *kaddr); extern int ext2_empty_dir (struct inode *); extern struct ext2_dir_entry_2 *ext2_dotdot(struct inode *dir, struct page **p, void **pa); -extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, void *, - struct inode *, int); +int ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, + 
struct page *page, void *page_addr, struct inode *inode, + bool update_times); static inline void ext2_put_page(struct page *page, void *page_addr) { kunmap_local(page_addr); @@ -753,8 +754,8 @@ extern struct inode *ext2_iget (struct super_block *, unsigned long); extern int ext2_write_inode (struct inode *, struct writeback_control *); extern void ext2_evict_inode(struct inode *); extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); -extern int ext2_setattr (struct user_namespace *, struct dentry *, struct iattr *); -extern int ext2_getattr (struct user_namespace *, const struct path *, +extern int ext2_setattr (struct mnt_idmap *, struct dentry *, struct iattr *); +extern int ext2_getattr (struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern void ext2_set_inode_flags(struct inode *inode); extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, @@ -762,7 +763,7 @@ extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, /* ioctl.c */ extern int ext2_fileattr_get(struct dentry *dentry, struct fileattr *fa); -extern int ext2_fileattr_set(struct user_namespace *mnt_userns, +extern int ext2_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); extern long ext2_ioctl(struct file *, unsigned int, unsigned long); extern long ext2_compat_ioctl(struct file *, unsigned int, unsigned long); diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 78b8686d9a4a..a4e1d7a9c544 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -545,7 +545,7 @@ got: inode->i_uid = current_fsuid(); inode->i_gid = dir->i_gid; } else - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_ino = ino; inode->i_blocks = 0; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 69aed9e2359e..26f135e7ffce 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1592,7 +1592,7 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc) return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); } -int ext2_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ext2_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); @@ -1614,28 +1614,28 @@ int ext2_getattr(struct user_namespace *mnt_userns, const struct path *path, STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); return 0; } -int ext2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ext2_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(&init_user_ns, dentry, iattr); + error = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (error) return error; - if (is_quota_modification(mnt_userns, inode, iattr)) { + if (is_quota_modification(&nop_mnt_idmap, inode, iattr)) { error = dquot_initialize(inode); if (error) return error; } - if (i_uid_needs_update(mnt_userns, iattr, inode) || - i_gid_needs_update(mnt_userns, iattr, inode)) { - error = dquot_transfer(mnt_userns, inode, iattr); + if (i_uid_needs_update(&nop_mnt_idmap, iattr, inode) || + i_gid_needs_update(&nop_mnt_idmap, iattr, inode)) { + error = dquot_transfer(&nop_mnt_idmap, inode, iattr); if (error) return error; } @@ -1644,9 +1644,9 @@ int ext2_setattr(struct 
user_namespace *mnt_userns, struct dentry *dentry, if (error) return error; } - setattr_copy(&init_user_ns, inode, iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); if (iattr->ia_valid & ATTR_MODE) - error = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); + error = posix_acl_chmod(&nop_mnt_idmap, dentry, inode->i_mode); mark_inode_dirty(inode); return error; diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index e8340bf09b10..cc87d413eb43 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -27,7 +27,7 @@ int ext2_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -int ext2_fileattr_set(struct user_namespace *mnt_userns, +int ext2_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); @@ -66,7 +66,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case EXT2_IOC_SETVERSION: { __u32 generation; - if (!inode_owner_or_capable(&init_user_ns, inode)) + if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) return -EPERM; ret = mnt_want_write_file(filp); if (ret) @@ -99,7 +99,7 @@ setversion_out: if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) return -ENOTTY; - if (!inode_owner_or_capable(&init_user_ns, inode)) + if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) return -EACCES; if (get_user(rsv_window_size, (int __user *)arg)) diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index c056957221a2..7f5dfa87cc95 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -99,7 +99,7 @@ struct dentry *ext2_get_parent(struct dentry *child) * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int ext2_create (struct user_namespace * mnt_userns, +static int ext2_create (struct mnt_idmap * idmap, struct inode * dir, struct dentry * dentry, umode_t mode, bool excl) { @@ -119,7 +119,7 @@ static int ext2_create (struct user_namespace * mnt_userns, return ext2_add_nondir(dentry, inode); } -static int ext2_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +static int ext2_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode = ext2_new_inode(dir, mode, NULL); @@ -133,7 +133,7 @@ static int ext2_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, return finish_open_simple(file, 0); } -static int ext2_mknod (struct user_namespace * mnt_userns, struct inode * dir, +static int ext2_mknod (struct mnt_idmap * idmap, struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode * inode; @@ -154,7 +154,7 @@ static int ext2_mknod (struct user_namespace * mnt_userns, struct inode * dir, return err; } -static int ext2_symlink (struct user_namespace * mnt_userns, struct inode * dir, +static int ext2_symlink (struct mnt_idmap * idmap, struct inode * dir, struct dentry * dentry, const char * symname) { struct super_block * sb = dir->i_sb; @@ -225,7 +225,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir, return err; } -static int ext2_mkdir(struct user_namespace * mnt_userns, +static int ext2_mkdir(struct mnt_idmap * idmap, struct inode * dir, struct dentry * dentry, umode_t mode) { struct inode * inode; @@ -315,7 +315,7 @@ static int ext2_rmdir (struct inode * dir, struct dentry *dentry) return err; } -static int ext2_rename (struct user_namespace * mnt_userns, +static int ext2_rename (struct mnt_idmap * idmap, struct inode * old_dir, struct dentry * old_dentry, struct inode * new_dir, struct dentry * new_dentry, unsigned int flags) @@ 
-370,8 +370,11 @@ static int ext2_rename (struct user_namespace * mnt_userns, err = PTR_ERR(new_de); goto out_dir; } - ext2_set_link(new_dir, new_de, new_page, page_addr, old_inode, 1); + err = ext2_set_link(new_dir, new_de, new_page, page_addr, + old_inode, true); ext2_put_page(new_page, page_addr); + if (err) + goto out_dir; new_inode->i_ctime = current_time(new_inode); if (dir_de) drop_nlink(new_inode); @@ -394,24 +397,24 @@ static int ext2_rename (struct user_namespace * mnt_userns, ext2_delete_entry(old_de, old_page, old_page_addr); if (dir_de) { - if (old_dir != new_dir) - ext2_set_link(old_inode, dir_de, dir_page, - dir_page_addr, new_dir, 0); + if (old_dir != new_dir) { + err = ext2_set_link(old_inode, dir_de, dir_page, + dir_page_addr, new_dir, false); + } ext2_put_page(dir_page, dir_page_addr); inode_dec_link_count(old_dir); } +out_old: ext2_put_page(old_page, old_page_addr); - return 0; +out: + return err; out_dir: if (dir_de) ext2_put_page(dir_page, dir_page_addr); -out_old: - ext2_put_page(old_page, old_page_addr); -out: - return err; + goto out_old; } const struct inode_operations ext2_dir_inode_operations = { diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index ebade1f52451..db47b8ab153e 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -19,7 +19,7 @@ ext2_xattr_security_get(const struct xattr_handler *handler, static int ext2_xattr_security_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 18a87d5dd1ab..995f931228ce 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -26,7 +26,7 @@ ext2_xattr_trusted_get(const struct xattr_handler *handler, static int ext2_xattr_trusted_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index 58092449f8ff..dd1507231081 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -30,7 +30,7 @@ ext2_xattr_user_get(const struct xattr_handler *handler, static int ext2_xattr_user_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index a9f89539aeee..27fcbddfb148 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -225,7 +225,7 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, } int -ext4_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +ext4_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { handle_t *handle; @@ -249,7 +249,7 @@ retry: return PTR_ERR(handle); if ((type == ACL_TYPE_ACCESS) && acl) { - error = posix_acl_update_mode(mnt_userns, inode, &mode, &acl); + error = posix_acl_update_mode(idmap, inode, &mode, &acl); if (error) goto out_stop; if (mode != inode->i_mode) diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 09c4a8a3b716..0c5a79c3b5d4 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -56,7 +56,7 @@ static inline int ext4_acl_count(size_t size) /* acl.c */ struct posix_acl *ext4_get_acl(struct inode *inode, int type, bool rcu); -int ext4_set_acl(struct user_namespace 
*mnt_userns, struct dentry *dentry, +int ext4_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 140e1eb300d1..4eeb02d456a9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1529,6 +1529,7 @@ struct ext4_sb_info { unsigned int s_mount_opt2; unsigned long s_mount_flags; unsigned int s_def_mount_opt; + unsigned int s_def_mount_opt2; ext4_fsblk_t s_sb_block; atomic64_t s_resv_clusters; kuid_t s_resuid; @@ -2845,7 +2846,7 @@ extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len, /* ialloc.c */ extern int ext4_mark_inode_used(struct super_block *sb, int ino); -extern struct inode *__ext4_new_inode(struct user_namespace *, handle_t *, +extern struct inode *__ext4_new_inode(struct mnt_idmap *, handle_t *, struct inode *, umode_t, const struct qstr *qstr, __u32 goal, uid_t *owner, __u32 i_flags, @@ -2853,11 +2854,11 @@ extern struct inode *__ext4_new_inode(struct user_namespace *, handle_t *, int nblocks); #define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \ - __ext4_new_inode(&init_user_ns, (handle), (dir), (mode), (qstr), \ + __ext4_new_inode(&nop_mnt_idmap, (handle), (dir), (mode), (qstr), \ (goal), (owner), i_flags, 0, 0, 0) -#define ext4_new_inode_start_handle(mnt_userns, dir, mode, qstr, goal, owner, \ +#define ext4_new_inode_start_handle(idmap, dir, mode, qstr, goal, owner, \ type, nblocks) \ - __ext4_new_inode((mnt_userns), NULL, (dir), (mode), (qstr), (goal), (owner), \ + __ext4_new_inode((idmap), NULL, (dir), (mode), (qstr), (goal), (owner), \ 0, (type), __LINE__, (nblocks)) @@ -2976,14 +2977,14 @@ extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, __ext4_iget((sb), (ino), (flags), __func__, __LINE__) extern int ext4_write_inode(struct inode *, struct writeback_control *); -extern int ext4_setattr(struct user_namespace *, struct dentry *, +extern int ext4_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern u32 ext4_dio_alignment(struct inode *inode); -extern int ext4_getattr(struct user_namespace *, const struct path *, +extern int ext4_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern void ext4_evict_inode(struct inode *); extern void ext4_clear_inode(struct inode *); -extern int ext4_file_getattr(struct user_namespace *, const struct path *, +extern int ext4_file_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern int ext4_sync_inode(handle_t *, struct inode *); extern void ext4_dirty_inode(struct inode *, int); @@ -3024,7 +3025,7 @@ extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); -int ext4_fileattr_set(struct user_namespace *mnt_userns, +int ext4_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa); extern void ext4_reset_inode_seed(struct inode *inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9de1c9d1a13d..3559ea6b0781 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3251,7 +3251,7 @@ static int ext4_split_extent_at(handle_t *handle, ext4_ext_mark_unwritten(ex2); err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags); - if (err != -ENOSPC && err != 
-EDQUOT) + if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM) goto out; if (EXT4_EXT_MAY_ZEROOUT & split_flag) { diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 4594b62f147b..b06de728b3b6 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1332,8 +1332,14 @@ struct dentry_info_args { char *dname; }; +/* Same as struct ext4_fc_tl, but uses native endianness fields */ +struct ext4_fc_tl_mem { + u16 fc_tag; + u16 fc_len; +}; + static inline void tl_to_darg(struct dentry_info_args *darg, - struct ext4_fc_tl *tl, u8 *val) + struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_dentry_info fcd; @@ -1345,16 +1351,18 @@ static inline void tl_to_darg(struct dentry_info_args *darg, darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info); } -static inline void ext4_fc_get_tl(struct ext4_fc_tl *tl, u8 *val) +static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val) { - memcpy(tl, val, EXT4_FC_TAG_BASE_LEN); - tl->fc_len = le16_to_cpu(tl->fc_len); - tl->fc_tag = le16_to_cpu(tl->fc_tag); + struct ext4_fc_tl tl_disk; + + memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN); + tl->fc_len = le16_to_cpu(tl_disk.fc_len); + tl->fc_tag = le16_to_cpu(tl_disk.fc_tag); } /* Unlink replay function */ -static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl, - u8 *val) +static int ext4_fc_replay_unlink(struct super_block *sb, + struct ext4_fc_tl_mem *tl, u8 *val) { struct inode *inode, *old_parent; struct qstr entry; @@ -1451,8 +1459,8 @@ out: } /* Link replay function */ -static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl, - u8 *val) +static int ext4_fc_replay_link(struct super_block *sb, + struct ext4_fc_tl_mem *tl, u8 *val) { struct inode *inode; struct dentry_info_args darg; @@ -1506,8 +1514,8 @@ static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) /* * Inode replay function */ -static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl, - u8 *val) +static int ext4_fc_replay_inode(struct super_block *sb, + struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_inode fc_inode; struct ext4_inode *raw_inode; @@ -1609,8 +1617,8 @@ out: * inode for which we are trying to create a dentry here, should already have * been replayed before we start here. 
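The fast-commit hunks above replace in-place endianness conversion with a decode step: struct ext4_fc_tl mirrors the on-disk little-endian layout, so overwriting its fields with CPU-endian values defeats the __le16 annotations and would corrupt the data if the same bytes were ever parsed twice. The new ext4_fc_tl_mem keeps the decoded, native-endian copy separate. A minimal userspace sketch of the same pattern, with invented type names standing in for the kernel's:

#include <stdint.h>
#include <stdio.h>

/* In-memory mirror with native-endianness fields, like ext4_fc_tl_mem.
 * The on-disk form (the kernel's struct ext4_fc_tl) stays raw bytes here. */
struct tl_mem {
        uint16_t fc_tag;
        uint16_t fc_len;
};

static uint16_t le16_to_host(const uint8_t *p)
{
        /* Assemble from bytes so this is correct on any host endianness. */
        return (uint16_t)(p[0] | (p[1] << 8));
}

static void get_tl(struct tl_mem *tl, const uint8_t *val)
{
        /* Decode into a separate native struct instead of converting the
         * little-endian fields in place. */
        tl->fc_tag = le16_to_host(val);
        tl->fc_len = le16_to_host(val + 2);
}

int main(void)
{
        const uint8_t raw[4] = { 0x05, 0x00, 0x20, 0x00 }; /* tag 5, len 32 */
        struct tl_mem tl;

        get_tl(&tl, raw);
        printf("tag=%u len=%u\n", tl.fc_tag, tl.fc_len);
        return 0;
}

The on-disk bytes stay untouched, so the decode is repeatable and the compiler's view of each struct matches what it actually holds.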
*/ -static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl, - u8 *val) +static int ext4_fc_replay_create(struct super_block *sb, + struct ext4_fc_tl_mem *tl, u8 *val) { int ret = 0; struct inode *inode = NULL; @@ -1708,7 +1716,7 @@ int ext4_fc_record_regions(struct super_block *sb, int ino, /* Replay add range tag */ static int ext4_fc_replay_add_range(struct super_block *sb, - struct ext4_fc_tl *tl, u8 *val) + struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_add_range fc_add_ex; struct ext4_extent newex, *ex; @@ -1828,8 +1836,8 @@ out: /* Replay DEL_RANGE tag */ static int -ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl, - u8 *val) +ext4_fc_replay_del_range(struct super_block *sb, + struct ext4_fc_tl_mem *tl, u8 *val) { struct inode *inode; struct ext4_fc_del_range lrange; @@ -2025,7 +2033,7 @@ static int ext4_fc_replay_scan(journal_t *journal, struct ext4_fc_replay_state *state; int ret = JBD2_FC_REPLAY_CONTINUE; struct ext4_fc_add_range ext; - struct ext4_fc_tl tl; + struct ext4_fc_tl_mem tl; struct ext4_fc_tail tail; __u8 *start, *end, *cur, *val; struct ext4_fc_head head; @@ -2144,7 +2152,7 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_fc_tl tl; + struct ext4_fc_tl_mem tl; __u8 *start, *end, *cur, *val; int ret = JBD2_FC_REPLAY_CONTINUE; struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 7ac0a81bd371..0b8b4499e5ca 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -202,8 +202,9 @@ ext4_extending_io(struct inode *inode, loff_t offset, size_t len) return false; } -/* Is IO overwriting allocated and initialized blocks? */ -static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len) +/* Is IO overwriting allocated or initialized blocks? */ +static bool ext4_overwrite_io(struct inode *inode, + loff_t pos, loff_t len, bool *unwritten) { struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; @@ -217,12 +218,15 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len) blklen = map.m_len; err = ext4_map_blocks(NULL, inode, &map, 0); + if (err != blklen) + return false; /* * 'err==len' means that all of the blocks have been preallocated, - * regardless of whether they have been initialized or not. To exclude - * unwritten extents, we need to check m_flags. + * regardless of whether they have been initialized or not. We need to + * check m_flags to distinguish the unwritten extents. */ - return err == blklen && (map.m_flags & EXT4_MAP_MAPPED); + *unwritten = !(map.m_flags & EXT4_MAP_MAPPED); + return true; } static ssize_t ext4_generic_write_checks(struct kiocb *iocb, @@ -431,11 +435,16 @@ static const struct iomap_dio_ops ext4_dio_write_ops = { * - For extending writes case we don't take the shared lock, since it requires * updating inode i_disksize and/or orphan handling with exclusive lock. * - * - shared locking will only be true mostly with overwrites. Otherwise we will - * switch to exclusive i_rwsem lock. + * - shared locking will only be true mostly with overwrites, including + * initialized blocks and unwritten blocks. For overwrite unwritten blocks + * we protect splitting extents by i_data_sem in ext4_inode_info, so we can + * also release exclusive i_rwsem lock. + * + * - Otherwise we will switch to exclusive i_rwsem lock. 
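The rewritten comment above encodes a small decision table: pure overwrites keep the shared i_rwsem and the overwrite iomap ops; overwrites of unwritten extents may also keep the shared lock, because extent splitting is serialized by i_data_sem, but must fall back to the default ops; everything else takes the exclusive lock. A hedged restatement of just that table, with invented names and none of the kernel's surrounding checks:

#include <stdbool.h>
#include <stdio.h>

enum lock_mode { LOCK_SHARED, LOCK_EXCL };
enum ops_kind  { OPS_DEFAULT, OPS_OVERWRITE };

struct dio_plan { enum lock_mode lock; enum ops_kind ops; };

/* Illustrative restatement of the policy after this patch: an overwrite of
 * an unwritten extent may use the shared lock but not the overwrite ops,
 * since the extent still needs conversion at I/O completion. */
static struct dio_plan plan_write(bool overwrite, bool unwritten, bool extending)
{
        struct dio_plan p = { LOCK_EXCL, OPS_DEFAULT };

        if (overwrite && !extending) {
                p.lock = LOCK_SHARED;
                if (!unwritten)
                        p.ops = OPS_OVERWRITE;
        }
        return p;
}

int main(void)
{
        struct dio_plan p = plan_write(true, true, false);

        printf("lock=%d ops=%d\n", p.lock, p.ops); /* shared lock, default ops */
        return 0;
}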
*/ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, - bool *ilock_shared, bool *extend) + bool *ilock_shared, bool *extend, + bool *unwritten) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); @@ -459,7 +468,7 @@ restart: * in file_modified(). */ if (*ilock_shared && (!IS_NOSEC(inode) || *extend || - !ext4_overwrite_io(inode, offset, count))) { + !ext4_overwrite_io(inode, offset, count, unwritten))) { if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; goto out; @@ -491,7 +500,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) loff_t offset = iocb->ki_pos; size_t count = iov_iter_count(from); const struct iomap_ops *iomap_ops = &ext4_iomap_ops; - bool extend = false, unaligned_io = false; + bool extend = false, unaligned_io = false, unwritten = false; bool ilock_shared = true; /* @@ -534,7 +543,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) return ext4_buffered_write_iter(iocb, from); } - ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend); + ret = ext4_dio_write_checks(iocb, from, + &ilock_shared, &extend, &unwritten); if (ret <= 0) return ret; @@ -582,7 +592,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) ext4_journal_stop(handle); } - if (ilock_shared) + if (ilock_shared && !unwritten) iomap_ops = &ext4_iomap_overwrite_ops; ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0, @@ -801,7 +811,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; - vma->vm_flags |= VM_HUGEPAGE; + vm_flags_set(vma, VM_HUGEPAGE); } else { vma->vm_ops = &ext4_file_vm_ops; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 63f9bb6e8851..157663031f8c 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -921,7 +921,7 @@ static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode, * For other inodes, search forward from the parent directory's block * group to find a free inode. */ -struct inode *__ext4_new_inode(struct user_namespace *mnt_userns, +struct inode *__ext4_new_inode(struct mnt_idmap *idmap, handle_t *handle, struct inode *dir, umode_t mode, const struct qstr *qstr, __u32 goal, uid_t *owner, __u32 i_flags, @@ -972,10 +972,10 @@ struct inode *__ext4_new_inode(struct user_namespace *mnt_userns, i_gid_write(inode, owner[1]); } else if (test_opt(sb, GRPID)) { inode->i_mode = mode; - inode_fsuid_set(inode, mnt_userns); + inode_fsuid_set(inode, idmap); inode->i_gid = dir->i_gid; } else - inode_init_owner(mnt_userns, inode, dir, mode); + inode_init_owner(idmap, inode, dir, mode); if (ext4_has_feature_project(sb) && ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9d9f414f99fe..d251d705c276 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -786,11 +786,10 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) * once we get rid of using bh as a container for mapping information * to pass to / from get_block functions, this can go away. 
*/ + old_state = READ_ONCE(bh->b_state); do { - old_state = READ_ONCE(bh->b_state); new_state = (old_state & ~EXT4_MAP_FLAGS) | flags; - } while (unlikely( - cmpxchg(&bh->b_state, old_state, new_state) != old_state)); + } while (unlikely(!try_cmpxchg(&bh->b_state, &old_state, new_state))); } static int _ext4_get_block(struct inode *inode, sector_t iblock, @@ -1136,7 +1135,8 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, for (i = 0; i < nr_wait; i++) { int err2; - err2 = fscrypt_decrypt_pagecache_blocks(page, blocksize, + err2 = fscrypt_decrypt_pagecache_blocks(page_folio(page), + blocksize, bh_offset(wait[i])); if (err2) { clear_buffer_uptodate(wait[i]); @@ -2595,8 +2595,8 @@ static bool ext4_page_nomap_can_writeout(struct page *page) static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) { struct address_space *mapping = mpd->inode->i_mapping; - struct pagevec pvec; - unsigned int nr_pages; + struct folio_batch fbatch; + unsigned int nr_folios; long left = mpd->wbc->nr_to_write; pgoff_t index = mpd->first_page; pgoff_t end = mpd->last_page; @@ -2610,18 +2610,17 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; - - pagevec_init(&pvec); + folio_batch_init(&fbatch); mpd->map.m_len = 0; mpd->next_page = index; while (index <= end) { - nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag); - if (nr_pages == 0) + nr_folios = filemap_get_folios_tag(mapping, &index, end, + tag, &fbatch); + if (nr_folios == 0) break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; /* * Accumulated enough dirty pages? This doesn't apply @@ -2635,10 +2634,10 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) goto out; /* If we can't merge this page, we are done. 
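The ext4_update_bh_state() hunk just above is the stock cmpxchg-to-try_cmpxchg conversion: try_cmpxchg() writes the freshly observed value back into old_state whenever the exchange fails, so the loop loads the state once up front instead of re-reading it on every iteration. A userspace analogue using C11 atomics rather than the kernel API (the flag mask is illustrative):

#include <stdatomic.h>
#include <stdio.h>

static void update_flags(_Atomic unsigned long *state, unsigned long flags)
{
        unsigned long old_state = atomic_load(state);
        unsigned long new_state;

        do {
                new_state = (old_state & ~0xfUL) | flags;
                /* On failure, old_state is refreshed by the CAS itself. */
        } while (!atomic_compare_exchange_weak(state, &old_state, new_state));
}

int main(void)
{
        _Atomic unsigned long state = 0x123;

        update_flags(&state, 0x7);
        printf("%#lx\n", (unsigned long)atomic_load(&state)); /* 0x127 */
        return 0;
}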
*/ - if (mpd->map.m_len > 0 && mpd->next_page != page->index) + if (mpd->map.m_len > 0 && mpd->next_page != folio->index) goto out; - lock_page(page); + folio_lock(folio); /* * If the page is no longer dirty, or its mapping no * longer corresponds to inode we are writing (which @@ -2646,16 +2645,16 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) * page is already under writeback and we are not doing * a data integrity writeback, skip the page */ - if (!PageDirty(page) || - (PageWriteback(page) && + if (!folio_test_dirty(folio) || + (folio_test_writeback(folio) && (mpd->wbc->sync_mode == WB_SYNC_NONE)) || - unlikely(page->mapping != mapping)) { - unlock_page(page); + unlikely(folio->mapping != mapping)) { + folio_unlock(folio); continue; } - wait_on_page_writeback(page); - BUG_ON(PageWriteback(page)); + folio_wait_writeback(folio); + BUG_ON(folio_test_writeback(folio)); /* * Should never happen but for buggy code in @@ -2666,56 +2665,56 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) * * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz */ - if (!page_has_buffers(page)) { - ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", page->index); - ClearPageDirty(page); - unlock_page(page); + if (!folio_buffers(folio)) { + ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", folio->index); + folio_clear_dirty(folio); + folio_unlock(folio); continue; } if (mpd->map.m_len == 0) - mpd->first_page = page->index; - mpd->next_page = page->index + 1; + mpd->first_page = folio->index; + mpd->next_page = folio->index + folio_nr_pages(folio); /* * Writeout for transaction commit where we cannot * modify metadata is simple. Just submit the page. */ if (!mpd->can_map) { - if (ext4_page_nomap_can_writeout(page)) { - err = mpage_submit_page(mpd, page); + if (ext4_page_nomap_can_writeout(&folio->page)) { + err = mpage_submit_page(mpd, &folio->page); if (err < 0) goto out; } else { - unlock_page(page); - mpd->first_page++; + folio_unlock(folio); + mpd->first_page += folio_nr_pages(folio); } } else { /* Add all dirty buffers to mpd */ - lblk = ((ext4_lblk_t)page->index) << + lblk = ((ext4_lblk_t)folio->index) << (PAGE_SHIFT - blkbits); - head = page_buffers(page); + head = folio_buffers(folio); err = mpage_process_page_bufs(mpd, head, head, - lblk); + lblk); if (err <= 0) goto out; err = 0; } - left--; + left -= folio_nr_pages(folio); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } mpd->scanned_until_end = 1; return 0; out: - pagevec_release(&pvec); + folio_batch_release(&fbatch); return err; } -static int ext4_writepage_cb(struct page *page, struct writeback_control *wbc, +static int ext4_writepage_cb(struct folio *folio, struct writeback_control *wbc, void *data) { - return ext4_writepage(page, wbc); + return ext4_writepage(&folio->page, wbc); } static int ext4_do_writepages(struct mpage_da_data *mpd) @@ -3858,7 +3857,8 @@ static int __ext4_block_zero_page_range(handle_t *handle, if (fscrypt_inode_uses_fs_layer_crypto(inode)) { /* We expect the key to be set. 
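The writeback-loop conversion above is not just renaming: a folio may cover several pages, so every piece of bookkeeping that used to advance by one (next_page, first_page, the nr_to_write budget in left) now advances by folio_nr_pages(). A toy model of that accounting, with a plain struct standing in for the folio:

#include <stdio.h>

struct item { unsigned long index; unsigned int nr_pages; };

int main(void)
{
        struct item batch[] = { {0, 1}, {1, 4}, {5, 2} };
        size_t n = sizeof(batch) / sizeof(batch[0]);
        long left = 16;         /* like wbc->nr_to_write */
        unsigned long next = 0; /* like mpd->next_page */

        for (size_t i = 0; i < n; i++) {
                /* Stand-in for the "can't merge this folio" contiguity test. */
                if (next != 0 && batch[i].index != next)
                        break;
                next = batch[i].index + batch[i].nr_pages;
                left -= batch[i].nr_pages;
        }
        printf("next=%lu left=%ld\n", next, left); /* next=7 left=9 */
        return 0;
}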
*/ BUG_ON(!fscrypt_has_encryption_key(inode)); - err = fscrypt_decrypt_pagecache_blocks(page, blocksize, + err = fscrypt_decrypt_pagecache_blocks(page_folio(page), + blocksize, bh_offset(bh)); if (err) { clear_buffer_uptodate(bh); @@ -4872,13 +4872,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, goto bad_inode; raw_inode = ext4_raw_inode(&iloc); - if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) { - ext4_error_inode(inode, function, line, 0, - "iget: root inode unallocated"); - ret = -EFSCORRUPTED; - goto bad_inode; - } - if ((flags & EXT4_IGET_HANDLE) && (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) { ret = -ESTALE; @@ -4951,11 +4944,16 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, * NeilBrown 1999oct15 */ if (inode->i_nlink == 0) { - if ((inode->i_mode == 0 || + if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL || !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && ino != EXT4_BOOT_LOADER_INO) { - /* this inode is deleted */ - ret = -ESTALE; + /* this inode is deleted or unallocated */ + if (flags & EXT4_IGET_SPECIAL) { + ext4_error_inode(inode, function, line, 0, + "iget: special inode unallocated"); + ret = -EFSCORRUPTED; + } else + ret = -ESTALE; goto bad_inode; } /* The only unlinked inodes we let through here have @@ -5434,7 +5432,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) * * Called with inode->i_rwsem down. */ -int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -5454,7 +5452,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, ATTR_GID | ATTR_TIMES_SET)))) return -EPERM; - error = setattr_prepare(mnt_userns, dentry, attr); + error = setattr_prepare(idmap, dentry, attr); if (error) return error; @@ -5466,14 +5464,14 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (error) return error; - if (is_quota_modification(mnt_userns, inode, attr)) { + if (is_quota_modification(idmap, inode, attr)) { error = dquot_initialize(inode); if (error) return error; } - if (i_uid_needs_update(mnt_userns, attr, inode) || - i_gid_needs_update(mnt_userns, attr, inode)) { + if (i_uid_needs_update(idmap, attr, inode) || + i_gid_needs_update(idmap, attr, inode)) { handle_t *handle; /* (user+group)*(old+new) structure, inode write (sb, @@ -5490,7 +5488,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, * counts xattr inode references. 
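The __ext4_iget() change above folds the old root-only "unallocated inode" check into the generic zero-link-count path: when the caller passed EXT4_IGET_SPECIAL, an unallocated inode is now reported as hard corruption rather than the historical soft -ESTALE. A reduced sketch of just that classification, on Linux where errno.h provides EUCLEAN (the kernel aliases EFSCORRUPTED to it); the real condition also considers i_mode, orphan processing, and the boot-loader inode:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define EFSCORRUPTED EUCLEAN /* the kernel's own aliasing */

static int classify_unallocated(bool special)
{
        /* Special inodes must exist; a missing one is corruption, not
         * merely a stale handle. */
        return special ? -EFSCORRUPTED : -ESTALE;
}

int main(void)
{
        printf("%d %d\n", classify_unallocated(false), classify_unallocated(true));
        return 0;
}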
*/ down_read(&EXT4_I(inode)->xattr_sem); - error = dquot_transfer(mnt_userns, inode, attr); + error = dquot_transfer(idmap, inode, attr); up_read(&EXT4_I(inode)->xattr_sem); if (error) { @@ -5499,8 +5497,8 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } /* Update corresponding info in inode so that everything is in * one transaction */ - i_uid_update(mnt_userns, attr, inode); - i_gid_update(mnt_userns, attr, inode); + i_uid_update(idmap, attr, inode); + i_gid_update(idmap, attr, inode); error = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); if (unlikely(error)) { @@ -5630,7 +5628,7 @@ out_mmap_sem: if (!error) { if (inc_ivers) inode_inc_iversion(inode); - setattr_copy(mnt_userns, inode, attr); + setattr_copy(idmap, inode, attr); mark_inode_dirty(inode); } @@ -5642,7 +5640,7 @@ out_mmap_sem: ext4_orphan_del(NULL, inode); if (!error && (ia_valid & ATTR_MODE)) - rc = posix_acl_chmod(mnt_userns, dentry, inode->i_mode); + rc = posix_acl_chmod(idmap, dentry, inode->i_mode); err_out: if (error) @@ -5668,7 +5666,7 @@ u32 ext4_dio_alignment(struct inode *inode) return 1; /* use the iomap defaults */ } -int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); @@ -5725,18 +5723,18 @@ int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path, STATX_ATTR_NODUMP | STATX_ATTR_VERITY); - generic_fillattr(mnt_userns, inode, stat); + generic_fillattr(idmap, inode, stat); return 0; } -int ext4_file_getattr(struct user_namespace *mnt_userns, +int ext4_file_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); u64 delalloc_blocks; - ext4_getattr(mnt_userns, path, stat, request_mask, query_flags); + ext4_getattr(idmap, path, stat, request_mask, query_flags); /* * If there is inline data in the inode, the inode will normally not @@ -5788,7 +5786,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); int gdpblocks; int idxblocks; - int ret = 0; + int ret; /* * How many index blocks need to touch to map @lblocks logical blocks diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 8067ccda34e4..12435d61f09e 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -155,9 +155,6 @@ static int ext4_update_backup_sb(struct super_block *sb, set_buffer_uptodate(bh); unlock_buffer(bh); - if (err) - goto out_bh; - if (handle) { err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) @@ -358,12 +355,12 @@ void ext4_reset_inode_seed(struct inode *inode) * important fields of the inodes. 
* * @sb: the super block of the filesystem - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: the inode to swap with EXT4_BOOT_LOADER_INO * */ static long swap_inode_boot_loader(struct super_block *sb, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct inode *inode) { handle_t *handle; @@ -393,7 +390,7 @@ static long swap_inode_boot_loader(struct super_block *sb, } if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || - !inode_owner_or_capable(mnt_userns, inode) || + !inode_owner_or_capable(idmap, inode) || !capable(CAP_SYS_ADMIN)) { err = -EPERM; goto journal_err_out; @@ -979,7 +976,7 @@ int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -int ext4_fileattr_set(struct user_namespace *mnt_userns, +int ext4_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); @@ -1217,7 +1214,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; - struct user_namespace *mnt_userns = file_mnt_user_ns(filp); + struct mnt_idmap *idmap = file_mnt_idmap(filp); ext4_debug("cmd = %u, arg = %lu\n", cmd, arg); @@ -1234,7 +1231,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) __u32 generation; int err; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EPERM; if (ext4_has_metadata_csum(inode->i_sb)) { @@ -1376,7 +1373,7 @@ mext_out: case EXT4_IOC_MIGRATE: { int err; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EACCES; err = mnt_want_write_file(filp); @@ -1398,7 +1395,7 @@ mext_out: case EXT4_IOC_ALLOC_DA_BLKS: { int err; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EACCES; err = mnt_want_write_file(filp); @@ -1417,7 +1414,7 @@ mext_out: err = mnt_want_write_file(filp); if (err) return err; - err = swap_inode_boot_loader(sb, mnt_userns, inode); + err = swap_inode_boot_loader(sb, idmap, inode); mnt_drop_write_file(filp); return err; } @@ -1542,7 +1539,7 @@ resizefs_out: case EXT4_IOC_CLEAR_ES_CACHE: { - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EACCES; ext4_clear_inode_es(inode); return 0; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 8dbb87edf24c..2de9829aed63 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -110,22 +110,23 @@ out: } /** - * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2 + * mext_folio_double_lock - Grab and lock folio on both @inode1 and @inode2 * * @inode1: the inode structure * @inode2: the inode structure - * @index1: page index - * @index2: page index - * @page: result page vector + * @index1: folio index + * @index2: folio index + * @folio: result folio vector * - * Grab two locked pages for inode's by inode order + * Grab two locked folio for inode's by inode order */ static int -mext_page_double_lock(struct inode *inode1, struct inode *inode2, - pgoff_t index1, pgoff_t index2, struct page *page[2]) +mext_folio_double_lock(struct inode *inode1, struct inode *inode2, + pgoff_t index1, pgoff_t index2, struct folio *folio[2]) { struct address_space *mapping[2]; unsigned int flags; + unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; 
BUG_ON(!inode1 || !inode2); if (inode1 < inode2) { @@ -138,28 +139,30 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2, } flags = memalloc_nofs_save(); - page[0] = grab_cache_page_write_begin(mapping[0], index1); - if (!page[0]) { + folio[0] = __filemap_get_folio(mapping[0], index1, fgp_flags, + mapping_gfp_mask(mapping[0])); + if (!folio[0]) { memalloc_nofs_restore(flags); return -ENOMEM; } - page[1] = grab_cache_page_write_begin(mapping[1], index2); + folio[1] = __filemap_get_folio(mapping[1], index2, fgp_flags, + mapping_gfp_mask(mapping[1])); memalloc_nofs_restore(flags); - if (!page[1]) { - unlock_page(page[0]); - put_page(page[0]); + if (!folio[1]) { + folio_unlock(folio[0]); + folio_put(folio[0]); return -ENOMEM; } /* - * grab_cache_page_write_begin() may not wait on page's writeback if + * __filemap_get_folio() may not wait on folio's writeback if * BDI not demand that. But it is reasonable to be very conservative - * here and explicitly wait on page's writeback + * here and explicitly wait on folio's writeback */ - wait_on_page_writeback(page[0]); - wait_on_page_writeback(page[1]); + folio_wait_writeback(folio[0]); + folio_wait_writeback(folio[1]); if (inode1 > inode2) - swap(page[0], page[1]); + swap(folio[0], folio[1]); return 0; } @@ -252,7 +255,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, int block_len_in_page, int unwritten, int *err) { struct inode *orig_inode = file_inode(o_filp); - struct page *pagep[2] = {NULL, NULL}; struct folio *folio[2] = {NULL, NULL}; handle_t *handle; ext4_lblk_t orig_blk_offset, donor_blk_offset; @@ -303,8 +305,8 @@ again: replaced_size = data_size; - *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset, - donor_page_offset, pagep); + *err = mext_folio_double_lock(orig_inode, donor_inode, orig_page_offset, + donor_page_offset, folio); if (unlikely(*err < 0)) goto stop_journal; /* @@ -314,8 +316,6 @@ again: * hold page's lock, if it is still the case data copy is not * necessary, just swap data blocks between orig and donor. */ - folio[0] = page_folio(pagep[0]); - folio[1] = page_folio(pagep[1]); VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]); VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index dd28453d6ea3..94608b7df7e8 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2792,7 +2792,7 @@ static int ext4_add_nondir(handle_t *handle, * If the create succeeds, we fill in the inode information * with d_instantiate(). 
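mext_folio_double_lock() above also swaps grab_cache_page_write_begin() for __filemap_get_folio() with explicit FGP flags, but the part worth imitating elsewhere is the ordering discipline it preserves: the two locks are always taken in a stable (pointer) order to rule out ABBA deadlocks, and the final swap() hands the folios back in the order the caller asked for. A pthread sketch of the same shape, with illustrative names:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct obj {
        pthread_mutex_t lock;
        int id;
};

/* Lock both objects in address order, then return them in caller order,
 * mirroring the swap(folio[0], folio[1]) at the end of the kernel code. */
static void double_lock(struct obj *a, struct obj *b, struct obj *out[2])
{
        int a_first = (uintptr_t)a < (uintptr_t)b;
        struct obj *first  = a_first ? a : b;
        struct obj *second = a_first ? b : a;

        pthread_mutex_lock(&first->lock);
        pthread_mutex_lock(&second->lock);

        out[0] = a;
        out[1] = b;
}

int main(void)
{
        struct obj x = { PTHREAD_MUTEX_INITIALIZER, 1 };
        struct obj y = { PTHREAD_MUTEX_INITIALIZER, 2 };
        struct obj *locked[2];

        double_lock(&y, &x, locked);
        printf("%d %d\n", locked[0]->id, locked[1]->id); /* 2 1 */
        pthread_mutex_unlock(&locked[0]->lock);
        pthread_mutex_unlock(&locked[1]->lock);
        return 0;
}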
*/ -static int ext4_create(struct user_namespace *mnt_userns, struct inode *dir, +static int ext4_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { handle_t *handle; @@ -2806,7 +2806,7 @@ static int ext4_create(struct user_namespace *mnt_userns, struct inode *dir, credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: - inode = ext4_new_inode_start_handle(mnt_userns, dir, mode, &dentry->d_name, + inode = ext4_new_inode_start_handle(idmap, dir, mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); @@ -2827,7 +2827,7 @@ retry: return err; } -static int ext4_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int ext4_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { handle_t *handle; @@ -2841,7 +2841,7 @@ static int ext4_mknod(struct user_namespace *mnt_userns, struct inode *dir, credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: - inode = ext4_new_inode_start_handle(mnt_userns, dir, mode, &dentry->d_name, + inode = ext4_new_inode_start_handle(idmap, dir, mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); @@ -2861,7 +2861,7 @@ retry: return err; } -static int ext4_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +static int ext4_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { handle_t *handle; @@ -2873,7 +2873,7 @@ static int ext4_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, return err; retry: - inode = ext4_new_inode_start_handle(mnt_userns, dir, mode, + inode = ext4_new_inode_start_handle(idmap, dir, mode, NULL, 0, NULL, EXT4_HT_DIR, EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + @@ -2972,7 +2972,7 @@ out: return err; } -static int ext4_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { handle_t *handle; @@ -2989,7 +2989,7 @@ static int ext4_mkdir(struct user_namespace *mnt_userns, struct inode *dir, credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: - inode = ext4_new_inode_start_handle(mnt_userns, dir, S_IFDIR | mode, + inode = ext4_new_inode_start_handle(idmap, dir, S_IFDIR | mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); @@ -3339,7 +3339,7 @@ out: return err; } -static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { handle_t *handle; @@ -3370,7 +3370,7 @@ static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir, credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; retry: - inode = ext4_new_inode_start_handle(mnt_userns, dir, S_IFLNK|S_IRWXUGO, + inode = ext4_new_inode_start_handle(idmap, dir, S_IFLNK|S_IRWXUGO, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); @@ -3720,7 +3720,7 @@ static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent) } } -static struct inode *ext4_whiteout_for_rename(struct user_namespace *mnt_userns, +static struct inode *ext4_whiteout_for_rename(struct mnt_idmap *idmap, struct ext4_renament *ent, int credits, handle_t **h) { @@ -3735,7 +3735,7 @@ 
static struct inode *ext4_whiteout_for_rename(struct user_namespace *mnt_userns, credits += (EXT4_MAXQUOTAS_TRANS_BLOCKS(ent->dir->i_sb) + EXT4_XATTR_TRANS_BLOCKS + 4); retry: - wh = ext4_new_inode_start_handle(mnt_userns, ent->dir, + wh = ext4_new_inode_start_handle(idmap, ent->dir, S_IFCHR | WHITEOUT_MODE, &ent->dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); @@ -3763,7 +3763,7 @@ retry: * while new_{dentry,inode) refers to the destination dentry/inode * This comes from rename(const char *oldpath, const char *newpath) */ -static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -3851,7 +3851,7 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, goto release_bh; } } else { - whiteout = ext4_whiteout_for_rename(mnt_userns, &old, credits, &handle); + whiteout = ext4_whiteout_for_rename(idmap, &old, credits, &handle); if (IS_ERR(whiteout)) { retval = PTR_ERR(whiteout); goto release_bh; @@ -3872,9 +3872,16 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir)) goto end_rename; } + /* + * We need to protect against old.inode directory getting + * converted from inline directory format into a normal one. + */ + inode_lock_nested(old.inode, I_MUTEX_NONDIR2); retval = ext4_rename_dir_prepare(handle, &old); - if (retval) + if (retval) { + inode_unlock(old.inode); goto end_rename; + } } /* * If we're renaming a file within an inline_data dir and adding or @@ -4006,6 +4013,8 @@ end_rename: } else { ext4_journal_stop(handle); } + if (old.dir_bh) + inode_unlock(old.inode); release_bh: brelse(old.dir_bh); brelse(old.bh); @@ -4158,7 +4167,7 @@ end_rename: return retval; } -static int ext4_rename2(struct user_namespace *mnt_userns, +static int ext4_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) @@ -4181,7 +4190,7 @@ static int ext4_rename2(struct user_namespace *mnt_userns, new_dir, new_dentry); } - return ext4_rename(mnt_userns, old_dir, old_dentry, new_dir, new_dentry, flags); + return ext4_rename(idmap, old_dir, old_dentry, new_dir, new_dentry, flags); } /* diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index d5266932ce6c..c61dc8a7c014 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -211,8 +211,7 @@ static void ext4_set_bio_post_read_ctx(struct bio *bio, static inline loff_t ext4_readpage_limit(struct inode *inode) { - if (IS_ENABLED(CONFIG_FS_VERITY) && - (IS_VERITY(inode) || ext4_verity_in_progress(inode))) + if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode)) return inode->i_sb->s_maxbytes; return i_size_read(inode); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 260c1b3e3ef2..88f7b8a88c76 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -482,7 +482,7 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) * * However, we may have to redirty a page (see below.) 
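The ext4_rename() fix just below takes I_MUTEX_NONDIR2 on the victim directory before ext4_rename_dir_prepare() and has to drop it on every exit: once in the early error branch, and once near end_rename (guarded by old.dir_bh, i.e. only when the lock was actually taken). That shape, a lock acquired mid-function with multiple exits, is easy to get wrong; a small sketch with invented names:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t victim_lock = PTHREAD_MUTEX_INITIALIZER;

static int prepare(int fail)
{
        return fail ? -1 : 0;
}

static int do_rename(int fail)
{
        int ret;

        pthread_mutex_lock(&victim_lock);
        ret = prepare(fail);
        if (ret) {
                pthread_mutex_unlock(&victim_lock); /* early error exit */
                return ret;
        }
        /* ... move directory entries ... */
        pthread_mutex_unlock(&victim_lock);         /* common exit */
        return 0;
}

int main(void)
{
        printf("%d %d\n", do_rename(0), do_rename(1));
        return 0;
}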
*/ -static int ext4_journalled_writepage_callback(struct page *page, +static int ext4_journalled_writepage_callback(struct folio *folio, struct writeback_control *wbc, void *data) { @@ -490,7 +490,7 @@ static int ext4_journalled_writepage_callback(struct page *page, struct buffer_head *bh, *head; struct journal_head *jh; - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); do { /* * We have to redirty a page in these cases: @@ -509,7 +509,7 @@ static int ext4_journalled_writepage_callback(struct page *page, if (buffer_dirty(bh) || (jh && (jh->b_transaction != transaction || jh->b_next_transaction))) { - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); goto out; } } while ((bh = bh->b_this_page) != head); @@ -2146,7 +2146,7 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) return 0; case Opt_commit: if (result.uint_32 == 0) - ctx->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE; + result.uint_32 = JBD2_DEFAULT_MAX_COMMIT_AGE; else if (result.uint_32 > INT_MAX / HZ) { ext4_msg(NULL, KERN_ERR, "Invalid commit interval %d, " @@ -2635,7 +2635,6 @@ static int ext4_check_test_dummy_encryption(const struct fs_context *fc, { const struct ext4_fs_context *ctx = fc->fs_private; const struct ext4_sb_info *sbi = EXT4_SB(sb); - int err; if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy)) return 0; @@ -2668,17 +2667,7 @@ static int ext4_check_test_dummy_encryption(const struct fs_context *fc, "Conflicting test_dummy_encryption options"); return -EINVAL; } - /* - * fscrypt_add_test_dummy_key() technically changes the super_block, so - * technically it should be delayed until ext4_apply_options() like the - * other changes. But since we never get here for remounts (see above), - * and this is the last chance to report errors, we do it here. - */ - err = fscrypt_add_test_dummy_key(sb, &ctx->dummy_enc_policy); - if (err) - ext4_msg(NULL, KERN_WARNING, - "Error adding test dummy encryption key [%d]", err); - return err; + return 0; } static void ext4_apply_test_dummy_encryption(struct ext4_fs_context *ctx, @@ -2894,7 +2883,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; - int def_errors, def_mount_opt = sbi->s_def_mount_opt; + int def_errors; const struct mount_opts *m; char sep = nodefs ? 
'\n' : ','; @@ -2906,15 +2895,28 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, for (m = ext4_mount_opts; m->token != Opt_err; m++) { int want_set = m->flags & MOPT_SET; + int opt_2 = m->flags & MOPT_2; + unsigned int mount_opt, def_mount_opt; + if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || m->flags & MOPT_SKIP) continue; - if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt))) - continue; /* skip if same as the default */ + + if (opt_2) { + mount_opt = sbi->s_mount_opt2; + def_mount_opt = sbi->s_def_mount_opt2; + } else { + mount_opt = sbi->s_mount_opt; + def_mount_opt = sbi->s_def_mount_opt; + } + /* skip if same as the default */ + if (!nodefs && !(m->mount_opt & (mount_opt ^ def_mount_opt))) + continue; + /* select Opt_noFoo vs Opt_Foo */ if ((want_set && - (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) || - (!want_set && (sbi->s_mount_opt & m->mount_opt))) - continue; /* select Opt_noFoo vs Opt_Foo */ + (mount_opt & m->mount_opt) != m->mount_opt) || + (!want_set && (mount_opt & m->mount_opt))) + continue; SEQ_OPTS_PRINT("%s", token2str(m->token)); } @@ -2942,7 +2944,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, if (nodefs || sbi->s_stripe) SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); if (nodefs || EXT4_MOUNT_DATA_FLAGS & - (sbi->s_mount_opt ^ def_mount_opt)) { + (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) SEQ_OPTS_PUTS("data=journal"); else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) @@ -4738,7 +4740,6 @@ static int ext4_group_desc_init(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned int db_count; ext4_fsblk_t block; - int ret; int i; db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / @@ -4778,8 +4779,7 @@ static int ext4_group_desc_init(struct super_block *sb, ext4_msg(sb, KERN_ERR, "can't read group descriptor %d", i); sbi->s_gdb_count = i; - ret = PTR_ERR(bh); - goto out; + return PTR_ERR(bh); } rcu_read_lock(); rcu_dereference(sbi->s_group_desc)[i] = bh; @@ -4788,13 +4788,10 @@ static int ext4_group_desc_init(struct super_block *sb, sbi->s_gdb_count = db_count; if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); - ret = -EFSCORRUPTED; - goto out; + return -EFSCORRUPTED; } + return 0; -out: - ext4_group_desc_free(sbi); - return ret; } static int ext4_load_and_init_journal(struct super_block *sb, @@ -5086,6 +5083,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) goto failed_mount; sbi->s_def_mount_opt = sbi->s_mount_opt; + sbi->s_def_mount_opt2 = sbi->s_mount_opt2; err = ext4_check_opt_consistency(fc, sb); if (err < 0) @@ -5220,14 +5218,14 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (ext4_geometry_check(sb, es)) goto failed_mount; - err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); - if (err) - goto failed_mount; - timer_setup(&sbi->s_err_report, print_daily_error_info, 0); spin_lock_init(&sbi->s_error_lock); INIT_WORK(&sbi->s_error_work, flush_stashed_error_work); + err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); + if (err) + goto failed_mount3; + /* Register extent status tree shrinker */ if (ext4_es_register_shrinker(sbi)) goto failed_mount3; @@ -5336,11 +5334,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) } } - if (ext4_has_feature_verity(sb) && sb->s_blocksize 
!= PAGE_SIZE) { - ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity"); - goto failed_mount_wq; - } - /* * Get the # of file system overhead blocks from the * superblock if present. @@ -5953,8 +5946,11 @@ static int ext4_load_journal(struct super_block *sb, if (!really_read_only && journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { es->s_journal_dev = cpu_to_le32(journal_devnum); - - /* Make sure we flush the recovery flag to disk. */ + ext4_commit_super(sb); + } + if (!really_read_only && journal_inum && + journal_inum != le32_to_cpu(es->s_journal_inum)) { + es->s_journal_inum = cpu_to_le32(journal_inum); ext4_commit_super(sb); } diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 3d3ed3c38f56..75bf1f88843c 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -55,12 +55,12 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, return paddr; } -static int ext4_encrypted_symlink_getattr(struct user_namespace *mnt_userns, +static int ext4_encrypted_symlink_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { - ext4_getattr(mnt_userns, path, stat, request_mask, query_flags); + ext4_getattr(idmap, path, stat, request_mask, query_flags); return fscrypt_symlink_getattr(path, stat); } diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index d233c24ea342..e2b8b3437c58 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -491,6 +491,11 @@ static void ext4_sb_release(struct kobject *kobj) complete(&sbi->s_kobj_unregister); } +static void ext4_feat_release(struct kobject *kobj) +{ + kfree(kobj); +} + static const struct sysfs_ops ext4_attr_ops = { .show = ext4_attr_show, .store = ext4_attr_store, @@ -505,7 +510,7 @@ static struct kobj_type ext4_sb_ktype = { static struct kobj_type ext4_feat_ktype = { .default_groups = ext4_feat_groups, .sysfs_ops = &ext4_attr_ops, - .release = (void (*)(struct kobject *))kfree, + .release = ext4_feat_release, }; void ext4_notify_error_sysfs(struct ext4_sb_info *sbi) diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index 30e3b65798b5..e4da1704438e 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -381,11 +381,11 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, } static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf, - u64 index, int log_blocksize) + u64 pos, unsigned int size) { - loff_t pos = ext4_verity_metadata_pos(inode) + (index << log_blocksize); + pos += ext4_verity_metadata_pos(inode); - return pagecache_write(inode, buf, 1 << log_blocksize, pos); + return pagecache_write(inode, buf, size, pos); } const struct fsverity_operations ext4_verityops = { diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 69a1b8c6a2ec..62f2ec599218 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -184,27 +184,73 @@ ext4_xattr_handler(int name_index) } static int -ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, - void *value_start) +check_xattrs(struct inode *inode, struct buffer_head *bh, + struct ext4_xattr_entry *entry, void *end, void *value_start, + const char *function, unsigned int line) { struct ext4_xattr_entry *e = entry; + int err = -EFSCORRUPTED; + char *err_str; + + if (bh) { + if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || + BHDR(bh)->h_blocks != cpu_to_le32(1)) { + err_str = "invalid header"; + goto errout; + } + if (buffer_verified(bh)) + return 0; + if (!ext4_xattr_block_csum_verify(inode, bh)) { + err = -EFSBADCRC; + err_str = "invalid checksum"; + goto errout; 
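The check_xattrs() consolidation that follows merges the block and in-inode validators into one routine whose failures all funnel through a single errout label carrying a specific reason string, so each new check costs two lines and every corruption report names its cause. A standalone sketch of that one-label, one-err_str shape (magic value and messages invented):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define XMAGIC 0xea020000u /* illustrative; not the on-disk encoding */

static int check_header(const uint8_t *buf, size_t len)
{
        const char *err_str;
        uint32_t magic;

        if (len < sizeof(magic)) {
                err_str = "header too small";
                goto errout;
        }
        memcpy(&magic, buf, sizeof(magic));
        if (magic != XMAGIC) {
                err_str = "bad magic number";
                goto errout;
        }
        return 0;

errout:
        /* One exit point reports every failure with its reason. */
        fprintf(stderr, "corrupted xattrs: %s\n", err_str);
        return -1;
}

int main(void)
{
        uint8_t good[4];
        uint32_t m = XMAGIC;

        memcpy(good, &m, sizeof(m));
        printf("%d %d\n", check_header(good, 4), check_header(good, 2));
        return 0;
}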
+ } + } else { + struct ext4_xattr_ibody_header *header = value_start; + + header -= 1; + if (end - (void *)header < sizeof(*header) + sizeof(u32)) { + err_str = "in-inode xattr block too small"; + goto errout; + } + if (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { + err_str = "bad magic number in in-inode xattr"; + goto errout; + } + } /* Find the end of the names list */ while (!IS_LAST_ENTRY(e)) { struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); - if ((void *)next >= end) - return -EFSCORRUPTED; - if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) - return -EFSCORRUPTED; + if ((void *)next >= end) { + err_str = "e_name out of bounds"; + goto errout; + } + if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) { + err_str = "bad e_name length"; + goto errout; + } e = next; } /* Check the values */ while (!IS_LAST_ENTRY(entry)) { u32 size = le32_to_cpu(entry->e_value_size); + unsigned long ea_ino = le32_to_cpu(entry->e_value_inum); - if (size > EXT4_XATTR_SIZE_MAX) - return -EFSCORRUPTED; + if (!ext4_has_feature_ea_inode(inode->i_sb) && ea_ino) { + err_str = "ea_inode specified without ea_inode feature enabled"; + goto errout; + } + if (ea_ino && ((ea_ino == EXT4_ROOT_INO) || + !ext4_valid_inum(inode->i_sb, ea_ino))) { + err_str = "invalid ea_ino"; + goto errout; + } + if (size > EXT4_XATTR_SIZE_MAX) { + err_str = "e_value size too large"; + goto errout; + } if (size != 0 && entry->e_value_inum == 0) { u16 offs = le16_to_cpu(entry->e_value_offs); @@ -216,66 +262,54 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, * the padded and unpadded sizes, since the size may * overflow to 0 when adding padding. */ - if (offs > end - value_start) - return -EFSCORRUPTED; + if (offs > end - value_start) { + err_str = "e_value out of bounds"; + goto errout; + } value = value_start + offs; if (value < (void *)e + sizeof(u32) || size > end - value || - EXT4_XATTR_SIZE(size) > end - value) - return -EFSCORRUPTED; + EXT4_XATTR_SIZE(size) > end - value) { + err_str = "overlapping e_value "; + goto errout; + } } entry = EXT4_XATTR_NEXT(entry); } - + if (bh) + set_buffer_verified(bh); return 0; + +errout: + if (bh) + __ext4_error_inode(inode, function, line, 0, -err, + "corrupted xattr block %llu: %s", + (unsigned long long) bh->b_blocknr, + err_str); + else + __ext4_error_inode(inode, function, line, 0, -err, + "corrupted in-inode xattr: %s", err_str); + return err; } static inline int __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh, const char *function, unsigned int line) { - int error = -EFSCORRUPTED; - - if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || - BHDR(bh)->h_blocks != cpu_to_le32(1)) - goto errout; - if (buffer_verified(bh)) - return 0; - - error = -EFSBADCRC; - if (!ext4_xattr_block_csum_verify(inode, bh)) - goto errout; - error = ext4_xattr_check_entries(BFIRST(bh), bh->b_data + bh->b_size, - bh->b_data); -errout: - if (error) - __ext4_error_inode(inode, function, line, 0, -error, - "corrupted xattr block %llu", - (unsigned long long) bh->b_blocknr); - else - set_buffer_verified(bh); - return error; + return check_xattrs(inode, bh, BFIRST(bh), bh->b_data + bh->b_size, + bh->b_data, function, line); } #define ext4_xattr_check_block(inode, bh) \ __ext4_xattr_check_block((inode), (bh), __func__, __LINE__) -static int +static inline int __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, void *end, const char *function, unsigned int line) { - int error = -EFSCORRUPTED; - - if (end - (void *)header < 
sizeof(*header) + sizeof(u32) || - (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC))) - goto errout; - error = ext4_xattr_check_entries(IFIRST(header), end, IFIRST(header)); -errout: - if (error) - __ext4_error_inode(inode, function, line, 0, -error, - "corrupted in-inode xattr"); - return error; + return check_xattrs(inode, NULL, IFIRST(header), end, IFIRST(header), + function, line); } #define xattr_check_inode(inode, header, end) \ @@ -388,6 +422,17 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, struct inode *inode; int err; + /* + * We have to check for this corruption early as otherwise + * iget_locked() could wait indefinitely for the state of our + * parent inode. + */ + if (parent->i_ino == ea_ino) { + ext4_error(parent->i_sb, + "Parent and EA inode have the same ino %lu", ea_ino); + return -EFSCORRUPTED; + } + inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { err = PTR_ERR(inode); @@ -482,11 +527,12 @@ ext4_xattr_inode_verify_hashes(struct inode *ea_inode, */ e_hash = ext4_xattr_hash_entry_signed(entry->e_name, entry->e_name_len, &tmp_data, 1); - if (e_hash == entry->e_hash) - return 0; - /* Still no match - bad */ - return -EFSCORRUPTED; + if (e_hash != entry->e_hash) + return -EFSCORRUPTED; + + /* Let people know about old hash */ + pr_warn_once("ext4: filesystem with signed xattr name hash"); } return 0; } @@ -1437,6 +1483,13 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle, uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) }; int err; + if (inode->i_sb->s_root == NULL) { + ext4_warning(inode->i_sb, + "refuse to create EA inode when umounting"); + WARN_ON(1); + return ERR_PTR(-EINVAL); + } + /* * Let the next inode be the goal, so we try and allocate the EA inode * in the same group, or nearby one. 
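The ext4_xattr_inode_verify_hashes() change above keeps a _signed hash variant as a fallback and warns once when it matches; the root cause, fixed a little further down by the (unsigned char) cast in ext4_xattr_hash_entry(), is that on targets where plain char is signed, name bytes of 0x80 and above sign-extend and poison the rolling hash. A self-contained demonstration of the divergence, using the same shift constant:

#include <stdio.h>

#define NAME_HASH_SHIFT 5

static unsigned int hash_name(const char *name, int len, int force_unsigned)
{
        unsigned int hash = 0;

        while (len--) {
                /* Forcing the signedness makes the demo deterministic
                 * regardless of the platform's plain-char signedness. */
                unsigned int c = force_unsigned
                        ? (unsigned char)*name
                        : (unsigned int)(signed char)*name;

                hash = (hash << NAME_HASH_SHIFT) ^
                       (hash >> (8 * sizeof(hash) - NAME_HASH_SHIFT)) ^ c;
                name++;
        }
        return hash;
}

int main(void)
{
        const char name[] = "\xc3\xa9"; /* UTF-8 "é": both bytes >= 0x80 */

        printf("signed:   %#x\n", hash_name(name, 2, 0));
        printf("unsigned: %#x\n", hash_name(name, 2, 1));
        return 0;
}

The two results differ because the sign-extended bytes XOR high bits into the hash, which is why filesystems written with the old code need the signed variant kept around for compatibility.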
@@ -2566,9 +2619,8 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); - buffer = kvmalloc(value_size, GFP_NOFS); b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); - if (!is || !bs || !buffer || !b_entry_name) { + if (!is || !bs || !b_entry_name) { error = -ENOMEM; goto out; } @@ -2580,12 +2632,18 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, /* Save the entry name and the entry value */ if (entry->e_value_inum) { + buffer = kvmalloc(value_size, GFP_NOFS); + if (!buffer) { + error = -ENOMEM; + goto out; + } + error = ext4_xattr_inode_get(inode, entry, buffer, value_size); if (error) goto out; } else { size_t value_offs = le16_to_cpu(entry->e_value_offs); - memcpy(buffer, (void *)IFIRST(header) + value_offs, value_size); + buffer = (void *)IFIRST(header) + value_offs; } memcpy(b_entry_name, entry->e_name, entry->e_name_len); @@ -2600,25 +2658,26 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, if (error) goto out; - /* Remove the chosen entry from the inode */ - error = ext4_xattr_ibody_set(handle, inode, &i, is); - if (error) - goto out; - i.value = buffer; i.value_len = value_size; error = ext4_xattr_block_find(inode, &i, bs); if (error) goto out; - /* Add entry which was removed from the inode into the block */ + /* Move ea entry from the inode into the block */ error = ext4_xattr_block_set(handle, inode, &i, bs); if (error) goto out; - error = 0; + + /* Remove the chosen entry from the inode */ + i.value = NULL; + i.value_len = 0; + error = ext4_xattr_ibody_set(handle, inode, &i, is); + out: kfree(b_entry_name); - kvfree(buffer); + if (entry->e_value_inum && buffer) + kvfree(buffer); if (is) brelse(is->iloc.bh); if (bs) @@ -3096,7 +3155,7 @@ static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value, while (name_len--) { hash = (hash << NAME_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ - *name++; + (unsigned char)*name++; } while (value_count--) { hash = (hash << VALUE_HASH_SHIFT) ^ diff --git a/fs/ext4/xattr_hurd.c b/fs/ext4/xattr_hurd.c index c78df5790377..8a5842e4cd95 100644 --- a/fs/ext4/xattr_hurd.c +++ b/fs/ext4/xattr_hurd.c @@ -32,7 +32,7 @@ ext4_xattr_hurd_get(const struct xattr_handler *handler, static int ext4_xattr_hurd_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 8213f66f7b2d..776cf11d24ca 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -23,7 +23,7 @@ ext4_xattr_security_get(const struct xattr_handler *handler, static int ext4_xattr_security_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index 7c21ffb26d25..9811eb0ab276 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -30,7 +30,7 @@ ext4_xattr_trusted_get(const struct xattr_handler *handler, static int ext4_xattr_trusted_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char 
*name, const void *value, size_t size, int flags) diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index 2fe7ff0a479c..4b70bf4e7626 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -31,7 +31,7 @@ ext4_xattr_user_get(const struct xattr_handler *handler, static int ext4_xattr_user_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index c1c74aa658ae..ec2aeccb69a3 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -204,7 +204,7 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type, bool rcu) return __f2fs_get_acl(inode, type, NULL); } -static int f2fs_acl_update_mode(struct user_namespace *mnt_userns, +static int f2fs_acl_update_mode(struct mnt_idmap *idmap, struct inode *inode, umode_t *mode_p, struct posix_acl **acl) { @@ -219,14 +219,14 @@ static int f2fs_acl_update_mode(struct user_namespace *mnt_userns, return error; if (error == 0) *acl = NULL; - if (!vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)) && - !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + if (!vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)) && + !capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID)) mode &= ~S_ISGID; *mode_p = mode; return 0; } -static int __f2fs_set_acl(struct user_namespace *mnt_userns, +static int __f2fs_set_acl(struct mnt_idmap *idmap, struct inode *inode, int type, struct posix_acl *acl, struct page *ipage) { @@ -240,7 +240,7 @@ static int __f2fs_set_acl(struct user_namespace *mnt_userns, case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl && !ipage) { - error = f2fs_acl_update_mode(mnt_userns, inode, + error = f2fs_acl_update_mode(idmap, inode, &mode, &acl); if (error) return error; @@ -276,7 +276,7 @@ static int __f2fs_set_acl(struct user_namespace *mnt_userns, return error; } -int f2fs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int f2fs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { struct inode *inode = d_inode(dentry); @@ -284,7 +284,7 @@ int f2fs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; - return __f2fs_set_acl(mnt_userns, inode, type, acl, NULL); + return __f2fs_set_acl(idmap, inode, type, acl, NULL); } /* diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index ea2bbb3f264b..94ebfbfbdc6f 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -34,7 +34,7 @@ struct f2fs_acl_header { #ifdef CONFIG_F2FS_FS_POSIX_ACL extern struct posix_acl *f2fs_get_acl(struct inode *, int, bool); -extern int f2fs_set_acl(struct user_namespace *, struct dentry *, +extern int f2fs_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); extern int f2fs_init_acl(struct inode *, struct inode *, struct page *, struct page *); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 56f7d0d6a8b2..c3e058e0a018 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -70,7 +70,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, .old_blkaddr = index, .new_blkaddr = index, .encrypted_page = NULL, - .is_por = !is_meta, + .is_por = !is_meta ? 
1 : 0, }; int err; @@ -171,10 +171,8 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { - if (time_to_inject(sbi, FAULT_BLKADDR)) { - f2fs_show_injection_info(sbi, FAULT_BLKADDR); + if (time_to_inject(sbi, FAULT_BLKADDR)) return false; - } switch (type) { case META_NAT: @@ -239,8 +237,8 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, .op = REQ_OP_READ, .op_flags = sync ? (REQ_META | REQ_PRIO) : REQ_RAHEAD, .encrypted_page = NULL, - .in_list = false, - .is_por = (type == META_POR), + .in_list = 0, + .is_por = (type == META_POR) ? 1 : 0, }; struct blk_plug plug; int err; @@ -395,59 +393,62 @@ long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, { struct address_space *mapping = META_MAPPING(sbi); pgoff_t index = 0, prev = ULONG_MAX; - struct pagevec pvec; + struct folio_batch fbatch; long nwritten = 0; - int nr_pages; + int nr_folios; struct writeback_control wbc = { .for_reclaim = 0, }; struct blk_plug plug; - pagevec_init(&pvec); + folio_batch_init(&fbatch); blk_start_plug(&plug); - while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY))) { + while ((nr_folios = filemap_get_folios_tag(mapping, &index, + (pgoff_t)-1, + PAGECACHE_TAG_DIRTY, &fbatch))) { int i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; - if (prev == ULONG_MAX) - prev = page->index - 1; - if (nr_to_write != LONG_MAX && page->index != prev + 1) { - pagevec_release(&pvec); + if (nr_to_write != LONG_MAX && i != 0 && + folio->index != prev + + folio_nr_pages(fbatch.folios[i-1])) { + folio_batch_release(&fbatch); goto stop; } - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping != mapping)) { + if (unlikely(folio->mapping != mapping)) { continue_unlock: - unlock_page(page); + folio_unlock(folio); continue; } - if (!PageDirty(page)) { + if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } - f2fs_wait_on_page_writeback(page, META, true, true); + f2fs_wait_on_page_writeback(&folio->page, META, + true, true); - if (!clear_page_dirty_for_io(page)) + if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; - if (__f2fs_write_meta_page(page, &wbc, io_type)) { - unlock_page(page); + if (__f2fs_write_meta_page(&folio->page, &wbc, + io_type)) { + folio_unlock(folio); break; } - nwritten++; - prev = page->index; + nwritten += folio_nr_pages(folio); + prev = folio->index; if (unlikely(nwritten >= nr_to_write)) break; } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } stop: @@ -622,7 +623,6 @@ int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi) if (time_to_inject(sbi, FAULT_ORPHAN)) { spin_unlock(&im->ino_lock); - f2fs_show_injection_info(sbi, FAULT_ORPHAN); return -ENOSPC; } @@ -795,7 +795,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) */ head = &im->ino_list; - /* loop for each orphan inode entry and write them in Jornal block */ + /* loop for each orphan inode entry and write them in journal block */ list_for_each_entry(orphan, head, list) { if (!page) { page = f2fs_grab_meta_page(sbi, start_blk++); @@ -1125,7 +1125,7 @@ retry: } else { /* * We should submit bio, since it exists several - * wribacking dentry pages in the freeing inode. + * writebacking dentry pages in the freeing inode. 
*/ f2fs_submit_merged_write(sbi, DATA); cond_resched(); @@ -1403,7 +1403,7 @@ static void commit_checkpoint(struct f2fs_sb_info *sbi, }; /* - * pagevec_lookup_tag and lock_page again will take + * filemap_get_folios_tag and lock_page again will take * some extra time. Therefore, f2fs_update_meta_pages and * f2fs_sync_meta_pages are combined in this function. */ @@ -1473,20 +1473,18 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi, true)); ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { - ckpt->cur_node_segno[i] = - cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE)); - ckpt->cur_node_blkoff[i] = - cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_NODE)); - ckpt->alloc_type[i + CURSEG_HOT_NODE] = - curseg_alloc_type(sbi, i + CURSEG_HOT_NODE); + struct curseg_info *curseg = CURSEG_I(sbi, i + CURSEG_HOT_NODE); + + ckpt->cur_node_segno[i] = cpu_to_le32(curseg->segno); + ckpt->cur_node_blkoff[i] = cpu_to_le16(curseg->next_blkoff); + ckpt->alloc_type[i + CURSEG_HOT_NODE] = curseg->alloc_type; } for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { - ckpt->cur_data_segno[i] = - cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA)); - ckpt->cur_data_blkoff[i] = - cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_DATA)); - ckpt->alloc_type[i + CURSEG_HOT_DATA] = - curseg_alloc_type(sbi, i + CURSEG_HOT_DATA); + struct curseg_info *curseg = CURSEG_I(sbi, i + CURSEG_HOT_DATA); + + ckpt->cur_data_segno[i] = cpu_to_le32(curseg->segno); + ckpt->cur_data_blkoff[i] = cpu_to_le16(curseg->next_blkoff); + ckpt->alloc_type[i + CURSEG_HOT_DATA] = curseg->alloc_type; } /* 2 cp + n data seg summary + orphan inode blocks */ diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 2532f369cb10..b40dec3d7f79 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -241,7 +241,7 @@ static int lz4_init_compress_ctx(struct compress_ctx *cc) unsigned int size = LZ4_MEM_COMPRESS; #ifdef CONFIG_F2FS_FS_LZ4HC - if (F2FS_I(cc->inode)->i_compress_flag >> COMPRESS_LEVEL_OFFSET) + if (F2FS_I(cc->inode)->i_compress_level) size = LZ4HC_MEM_COMPRESS; #endif @@ -267,8 +267,7 @@ static void lz4_destroy_compress_ctx(struct compress_ctx *cc) #ifdef CONFIG_F2FS_FS_LZ4HC static int lz4hc_compress_pages(struct compress_ctx *cc) { - unsigned char level = F2FS_I(cc->inode)->i_compress_flag >> - COMPRESS_LEVEL_OFFSET; + unsigned char level = F2FS_I(cc->inode)->i_compress_level; int len; if (level) @@ -340,8 +339,7 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc) zstd_cstream *stream; void *workspace; unsigned int workspace_size; - unsigned char level = F2FS_I(cc->inode)->i_compress_flag >> - COMPRESS_LEVEL_OFFSET; + unsigned char level = F2FS_I(cc->inode)->i_compress_level; if (!level) level = F2FS_ZSTD_DEFAULT_CLEVEL; @@ -564,7 +562,7 @@ module_param(num_compress_pages, uint, 0444); MODULE_PARM_DESC(num_compress_pages, "Number of intermediate compress pages to preallocate"); -int f2fs_init_compress_mempool(void) +int __init f2fs_init_compress_mempool(void) { compress_page_pool = mempool_create_page_pool(num_compress_pages, 0); return compress_page_pool ? 
0 : -ENOMEM; @@ -690,9 +688,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) vm_unmap_ram(cc->cbuf, cc->nr_cpages); vm_unmap_ram(cc->rbuf, cc->cluster_size); - for (i = 0; i < cc->nr_cpages; i++) { - if (i < new_nr_cpages) - continue; + for (i = new_nr_cpages; i < cc->nr_cpages; i++) { f2fs_compress_free_page(cc->cpages[i]); cc->cpages[i] = NULL; } @@ -1070,7 +1066,7 @@ retry: if (ret) goto out; if (bio) - f2fs_submit_bio(sbi, bio, DATA); + f2fs_submit_read_bio(sbi, bio, DATA); ret = f2fs_init_compress_ctx(cc); if (ret) @@ -1215,10 +1211,11 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, .page = NULL, .encrypted_page = NULL, .compressed_page = NULL, - .submitted = false, + .submitted = 0, .io_type = io_type, .io_wbc = wbc, - .encrypted = fscrypt_inode_uses_fs_layer_crypto(cc->inode), + .encrypted = fscrypt_inode_uses_fs_layer_crypto(cc->inode) ? + 1 : 0, }; struct dnode_of_data dn; struct node_info ni; @@ -1228,7 +1225,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, loff_t psize; int i, err; - /* we should bypass data pages to proceed the kworkder jobs */ + /* we should bypass data pages to proceed the kworker jobs */ if (unlikely(f2fs_cp_error(sbi))) { mapping_set_error(cc->rpages[0]->mapping, -EIO); goto out_free; @@ -1813,6 +1810,7 @@ unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn) const struct address_space_operations f2fs_compress_aops = { .release_folio = f2fs_release_folio, .invalidate_folio = f2fs_invalidate_folio, + .migrate_folio = filemap_migrate_folio, }; struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 97e816590cd9..06b552a0aba2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -292,13 +292,11 @@ static void f2fs_read_end_io(struct bio *bio) struct bio_post_read_ctx *ctx; bool intask = in_task(); - iostat_update_and_unbind_ctx(bio, 0); + iostat_update_and_unbind_ctx(bio); ctx = bio->bi_private; - if (time_to_inject(sbi, FAULT_READ_IO)) { - f2fs_show_injection_info(sbi, FAULT_READ_IO); + if (time_to_inject(sbi, FAULT_READ_IO)) bio->bi_status = BLK_STS_IOERR; - } if (bio->bi_status) { f2fs_finish_read_bio(bio, intask); @@ -332,13 +330,11 @@ static void f2fs_write_end_io(struct bio *bio) struct bio_vec *bvec; struct bvec_iter_all iter_all; - iostat_update_and_unbind_ctx(bio, 1); + iostat_update_and_unbind_ctx(bio); sbi = bio->bi_private; - if (time_to_inject(sbi, FAULT_WRITE_IO)) { - f2fs_show_injection_info(sbi, FAULT_WRITE_IO); + if (time_to_inject(sbi, FAULT_WRITE_IO)) bio->bi_status = BLK_STS_IOERR; - } bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; @@ -507,65 +503,66 @@ static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode, return fscrypt_mergeable_bio(bio, inode, next_idx); } -static inline void __submit_bio(struct f2fs_sb_info *sbi, - struct bio *bio, enum page_type type) +void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio, + enum page_type type) { - if (!is_read_io(bio_op(bio))) { - unsigned int start; + WARN_ON_ONCE(!is_read_io(bio_op(bio))); + trace_f2fs_submit_read_bio(sbi->sb, type, bio); - if (type != DATA && type != NODE) - goto submit_io; + iostat_update_submit_ctx(bio, type); + submit_bio(bio); +} - if (f2fs_lfs_mode(sbi) && current->plug) - blk_finish_plug(current->plug); +static void f2fs_align_write_bio(struct f2fs_sb_info *sbi, struct bio *bio) +{ + unsigned int start = + (bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS) % F2FS_IO_SIZE(sbi); + 
+ if (start == 0) + return; - if (!F2FS_IO_ALIGNED(sbi)) - goto submit_io; + /* fill dummy pages */ + for (; start < F2FS_IO_SIZE(sbi); start++) { + struct page *page = + mempool_alloc(sbi->write_io_dummy, + GFP_NOIO | __GFP_NOFAIL); + f2fs_bug_on(sbi, !page); - start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS; - start %= F2FS_IO_SIZE(sbi); + lock_page(page); - if (start == 0) - goto submit_io; + zero_user_segment(page, 0, PAGE_SIZE); + set_page_private_dummy(page); - /* fill dummy pages */ - for (; start < F2FS_IO_SIZE(sbi); start++) { - struct page *page = - mempool_alloc(sbi->write_io_dummy, - GFP_NOIO | __GFP_NOFAIL); - f2fs_bug_on(sbi, !page); + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) + f2fs_bug_on(sbi, 1); + } +} - lock_page(page); +static void f2fs_submit_write_bio(struct f2fs_sb_info *sbi, struct bio *bio, + enum page_type type) +{ + WARN_ON_ONCE(is_read_io(bio_op(bio))); - zero_user_segment(page, 0, PAGE_SIZE); - set_page_private_dummy(page); + if (type == DATA || type == NODE) { + if (f2fs_lfs_mode(sbi) && current->plug) + blk_finish_plug(current->plug); - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) - f2fs_bug_on(sbi, 1); + if (F2FS_IO_ALIGNED(sbi)) { + f2fs_align_write_bio(sbi, bio); + /* + * In the NODE case, we lose next block address chain. + * So, we need to do checkpoint in f2fs_sync_file. + */ + if (type == NODE) + set_sbi_flag(sbi, SBI_NEED_CP); } - /* - * In the NODE case, we lose next block address chain. So, we - * need to do checkpoint in f2fs_sync_file. - */ - if (type == NODE) - set_sbi_flag(sbi, SBI_NEED_CP); } -submit_io: - if (is_read_io(bio_op(bio))) - trace_f2fs_submit_read_bio(sbi->sb, type, bio); - else - trace_f2fs_submit_write_bio(sbi->sb, type, bio); + trace_f2fs_submit_write_bio(sbi->sb, type, bio); iostat_update_submit_ctx(bio, type); submit_bio(bio); } -void f2fs_submit_bio(struct f2fs_sb_info *sbi, - struct bio *bio, enum page_type type) -{ - __submit_bio(sbi, bio, type); -} - static void __submit_merged_bio(struct f2fs_bio_info *io) { struct f2fs_io_info *fio = &io->fio; @@ -573,12 +570,13 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) if (!io->bio) return; - if (is_read_io(fio->op)) + if (is_read_io(fio->op)) { trace_f2fs_prepare_read_bio(io->sbi->sb, fio->type, io->bio); - else + f2fs_submit_read_bio(io->sbi, io->bio, fio->type); + } else { trace_f2fs_prepare_write_bio(io->sbi->sb, fio->type, io->bio); - - __submit_bio(io->sbi, io->bio, fio->type); + f2fs_submit_write_bio(io->sbi, io->bio, fio->type); + } io->bio = NULL; } @@ -655,6 +653,9 @@ static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, f2fs_down_write(&io->io_rwsem); + if (!io->bio) + goto unlock_out; + /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; @@ -663,6 +664,7 @@ static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, io->bio->bi_opf |= REQ_PREFLUSH | REQ_FUA; } __submit_merged_bio(io); +unlock_out: f2fs_up_write(&io->io_rwsem); } @@ -741,12 +743,15 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) } if (fio->io_wbc && !is_read_io(fio->op)) - wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); inc_page_count(fio->sbi, is_read_io(fio->op) ? 
__read_io_type(page) : WB_DATA_TYPE(fio->page)); - __submit_bio(fio->sbi, bio, fio->type); + if (is_read_io(bio_op(bio))) + f2fs_submit_read_bio(fio->sbi, bio, fio->type); + else + f2fs_submit_write_bio(fio->sbi, bio, fio->type); return 0; } @@ -848,7 +853,7 @@ static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio, /* page can't be merged into bio; submit the bio */ del_bio_entry(be); - __submit_bio(sbi, *bio, DATA); + f2fs_submit_write_bio(sbi, *bio, DATA); break; } f2fs_up_write(&io->bio_list_lock); @@ -911,7 +916,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, } if (found) - __submit_bio(sbi, target, DATA); + f2fs_submit_write_bio(sbi, target, DATA); if (bio && *bio) { bio_put(*bio); *bio = NULL; @@ -948,7 +953,7 @@ alloc_new: } if (fio->io_wbc) - wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); inc_page_count(fio->sbi, WB_DATA_TYPE(page)); @@ -991,7 +996,7 @@ next: bio_page = fio->page; /* set submitted = true as a return value */ - fio->submitted = true; + fio->submitted = 1; inc_page_count(sbi, WB_DATA_TYPE(bio_page)); @@ -1007,7 +1012,7 @@ alloc_new: (fio->type == DATA || fio->type == NODE) && fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) { dec_page_count(sbi, WB_DATA_TYPE(bio_page)); - fio->retry = true; + fio->retry = 1; goto skip; } io->bio = __bio_alloc(fio, BIO_MAX_VECS); @@ -1022,7 +1027,7 @@ alloc_new: } if (fio->io_wbc) - wbc_account_cgroup_owner(fio->io_wbc, bio_page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); io->last_block_in_bio = fio->new_blkaddr; @@ -1107,7 +1112,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, } inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); - __submit_bio(sbi, bio, DATA); + f2fs_submit_read_bio(sbi, bio, DATA); return 0; } @@ -1207,19 +1212,6 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) return err; } -int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) -{ - struct extent_info ei = {0, }; - struct inode *inode = dn->inode; - - if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { - dn->data_blkaddr = ei.blk + index - ei.fofs; - return 0; - } - - return f2fs_reserve_block(dn, index); -} - struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs) @@ -1227,15 +1219,14 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; - struct extent_info ei = {0, }; int err; page = f2fs_grab_cache_page(mapping, index, for_write); if (!page) return ERR_PTR(-ENOMEM); - if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { - dn.data_blkaddr = ei.blk + index - ei.fofs; + if (f2fs_lookup_read_extent_cache_block(inode, index, + &dn.data_blkaddr)) { if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ)) { err = -EFSCORRUPTED; @@ -1432,13 +1423,12 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) return err; dn->data_blkaddr = f2fs_data_blkaddr(dn); - if (dn->data_blkaddr != NULL_ADDR) - goto alloc; - - if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count)))) - return err; + if (dn->data_blkaddr == NULL_ADDR) { + err = inc_valid_block_count(sbi, dn->inode, &count); + if (unlikely(err)) + return err; + } -alloc: set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); old_blkaddr = 
dn->data_blkaddr; f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr, @@ -1452,19 +1442,91 @@ alloc: return 0; } -void f2fs_do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) +static void f2fs_map_lock(struct f2fs_sb_info *sbi, int flag) { - if (flag == F2FS_GET_BLOCK_PRE_AIO) { - if (lock) - f2fs_down_read(&sbi->node_change); - else - f2fs_up_read(&sbi->node_change); + if (flag == F2FS_GET_BLOCK_PRE_AIO) + f2fs_down_read(&sbi->node_change); + else + f2fs_lock_op(sbi); +} + +static void f2fs_map_unlock(struct f2fs_sb_info *sbi, int flag) +{ + if (flag == F2FS_GET_BLOCK_PRE_AIO) + f2fs_up_read(&sbi->node_change); + else + f2fs_unlock_op(sbi); +} + +int f2fs_get_block_locked(struct dnode_of_data *dn, pgoff_t index) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + int err = 0; + + f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); + if (!f2fs_lookup_read_extent_cache_block(dn->inode, index, + &dn->data_blkaddr)) + err = f2fs_reserve_block(dn, index); + f2fs_map_unlock(sbi, F2FS_GET_BLOCK_PRE_AIO); + + return err; +} + +static int f2fs_map_no_dnode(struct inode *inode, + struct f2fs_map_blocks *map, struct dnode_of_data *dn, + pgoff_t pgoff) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + /* + * There is one exceptional case that read_node_page() may return + * -ENOENT due to filesystem has been shutdown or cp_error, return + * -EIO in that case. + */ + if (map->m_may_create && + (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || f2fs_cp_error(sbi))) + return -EIO; + + if (map->m_next_pgofs) + *map->m_next_pgofs = f2fs_get_next_page_offset(dn, pgoff); + if (map->m_next_extent) + *map->m_next_extent = f2fs_get_next_page_offset(dn, pgoff); + return 0; +} + +static bool f2fs_map_blocks_cached(struct inode *inode, + struct f2fs_map_blocks *map, int flag) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int maxblocks = map->m_len; + pgoff_t pgoff = (pgoff_t)map->m_lblk; + struct extent_info ei = {}; + + if (!f2fs_lookup_read_extent_cache(inode, pgoff, &ei)) + return false; + + map->m_pblk = ei.blk + pgoff - ei.fofs; + map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgoff); + map->m_flags = F2FS_MAP_MAPPED; + if (map->m_next_extent) + *map->m_next_extent = pgoff + map->m_len; + + /* for hardware encryption, but to avoid potential issue in future */ + if (flag == F2FS_GET_BLOCK_DIO) + f2fs_wait_on_block_writeback_range(inode, + map->m_pblk, map->m_len); + + if (f2fs_allow_multi_device_dio(sbi, flag)) { + int bidx = f2fs_target_device_index(sbi, map->m_pblk); + struct f2fs_dev_info *dev = &sbi->devs[bidx]; + + map->m_bdev = dev->bdev; + map->m_pblk -= dev->start_blk; + map->m_len = min(map->m_len, dev->end_blk + 1 - map->m_pblk); } else { - if (lock) - f2fs_lock_op(sbi); - else - f2fs_unlock_op(sbi); + map->m_bdev = inode->i_sb->s_bdev; } + return true; } /* @@ -1472,8 +1534,7 @@ void f2fs_do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) * maps continuous logical blocks to physical blocks, and return such * info via f2fs_map_blocks structure. 
*/ -int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, - int create, int flag) +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag) { unsigned int maxblocks = map->m_len; struct dnode_of_data dn; @@ -1483,14 +1544,17 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int err = 0, ofs = 1; unsigned int ofs_in_node, last_ofs_in_node; blkcnt_t prealloc; - struct extent_info ei = {0, }; block_t blkaddr; unsigned int start_pgofs; int bidx = 0; + bool is_hole; if (!maxblocks) return 0; + if (!map->m_may_create && f2fs_map_blocks_cached(inode, map, flag)) + goto out; + map->m_bdev = inode->i_sb->s_bdev; map->m_multidev_dio = f2fs_allow_multi_device_dio(F2FS_I_SB(inode), flag); @@ -1502,42 +1566,9 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, pgofs = (pgoff_t)map->m_lblk; end = pgofs + maxblocks; - if (!create && f2fs_lookup_read_extent_cache(inode, pgofs, &ei)) { - if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO && - map->m_may_create) - goto next_dnode; - - map->m_pblk = ei.blk + pgofs - ei.fofs; - map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs); - map->m_flags = F2FS_MAP_MAPPED; - if (map->m_next_extent) - *map->m_next_extent = pgofs + map->m_len; - - /* for hardware encryption, but to avoid potential issue in future */ - if (flag == F2FS_GET_BLOCK_DIO) - f2fs_wait_on_block_writeback_range(inode, - map->m_pblk, map->m_len); - - if (map->m_multidev_dio) { - block_t blk_addr = map->m_pblk; - - bidx = f2fs_target_device_index(sbi, map->m_pblk); - - map->m_bdev = FDEV(bidx).bdev; - map->m_pblk -= FDEV(bidx).start_blk; - map->m_len = min(map->m_len, - FDEV(bidx).end_blk + 1 - map->m_pblk); - - if (map->m_may_create) - f2fs_update_device_state(sbi, inode->i_ino, - blk_addr, map->m_len); - } - goto out; - } - next_dnode: if (map->m_may_create) - f2fs_do_map_lock(sbi, flag, true); + f2fs_map_lock(sbi, flag); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -1545,29 +1576,8 @@ next_dnode: if (err) { if (flag == F2FS_GET_BLOCK_BMAP) map->m_pblk = 0; - - if (err == -ENOENT) { - /* - * There is one exceptional case that read_node_page() - * may return -ENOENT due to filesystem has been - * shutdown or cp_error, so force to convert error - * number to EIO for such case. 
- */ - if (map->m_may_create && - (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || - f2fs_cp_error(sbi))) { - err = -EIO; - goto unlock_out; - } - - err = 0; - if (map->m_next_pgofs) - *map->m_next_pgofs = - f2fs_get_next_page_offset(&dn, pgofs); - if (map->m_next_extent) - *map->m_next_extent = - f2fs_get_next_page_offset(&dn, pgofs); - } + if (err == -ENOENT) + err = f2fs_map_no_dnode(inode, map, &dn, pgofs); goto unlock_out; } @@ -1578,78 +1588,76 @@ next_dnode: next_block: blkaddr = f2fs_data_blkaddr(&dn); - - if (__is_valid_data_blkaddr(blkaddr) && - !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { + is_hole = !__is_valid_data_blkaddr(blkaddr); + if (!is_hole && + !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto sync_out; } - if (__is_valid_data_blkaddr(blkaddr)) { - /* use out-place-update for driect IO under LFS mode */ - if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO && - map->m_may_create) { + /* use out-place-update for direct IO under LFS mode */ + if (map->m_may_create && + (is_hole || (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO))) { + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto sync_out; + } + + switch (flag) { + case F2FS_GET_BLOCK_PRE_AIO: + if (blkaddr == NULL_ADDR) { + prealloc++; + last_ofs_in_node = dn.ofs_in_node; + } + break; + case F2FS_GET_BLOCK_PRE_DIO: + case F2FS_GET_BLOCK_DIO: err = __allocate_data_block(&dn, map->m_seg_type); if (err) goto sync_out; - blkaddr = dn.data_blkaddr; + if (flag == F2FS_GET_BLOCK_PRE_DIO) + file_need_truncate(inode); set_inode_flag(inode, FI_APPEND_WRITE); + break; + default: + WARN_ON_ONCE(1); + err = -EIO; + goto sync_out; } - } else { - if (create) { - if (unlikely(f2fs_cp_error(sbi))) { - err = -EIO; - goto sync_out; - } - if (flag == F2FS_GET_BLOCK_PRE_AIO) { - if (blkaddr == NULL_ADDR) { - prealloc++; - last_ofs_in_node = dn.ofs_in_node; - } - } else { - WARN_ON(flag != F2FS_GET_BLOCK_PRE_DIO && - flag != F2FS_GET_BLOCK_DIO); - err = __allocate_data_block(&dn, - map->m_seg_type); - if (!err) { - if (flag == F2FS_GET_BLOCK_PRE_DIO) - file_need_truncate(inode); - set_inode_flag(inode, FI_APPEND_WRITE); - } - } - if (err) - goto sync_out; + + blkaddr = dn.data_blkaddr; + if (is_hole) map->m_flags |= F2FS_MAP_NEW; - blkaddr = dn.data_blkaddr; - } else { - if (f2fs_compressed_file(inode) && - f2fs_sanity_check_cluster(&dn) && - (flag != F2FS_GET_BLOCK_FIEMAP || - IS_ENABLED(CONFIG_F2FS_CHECK_FS))) { - err = -EFSCORRUPTED; - f2fs_handle_error(sbi, - ERROR_CORRUPTED_CLUSTER); - goto sync_out; - } - if (flag == F2FS_GET_BLOCK_BMAP) { - map->m_pblk = 0; - goto sync_out; - } - if (flag == F2FS_GET_BLOCK_PRECACHE) - goto sync_out; - if (flag == F2FS_GET_BLOCK_FIEMAP && - blkaddr == NULL_ADDR) { - if (map->m_next_pgofs) - *map->m_next_pgofs = pgofs + 1; - goto sync_out; - } - if (flag != F2FS_GET_BLOCK_FIEMAP) { - /* for defragment case */ + } else if (is_hole) { + if (f2fs_compressed_file(inode) && + f2fs_sanity_check_cluster(&dn) && + (flag != F2FS_GET_BLOCK_FIEMAP || + IS_ENABLED(CONFIG_F2FS_CHECK_FS))) { + err = -EFSCORRUPTED; + f2fs_handle_error(sbi, + ERROR_CORRUPTED_CLUSTER); + goto sync_out; + } + + switch (flag) { + case F2FS_GET_BLOCK_PRECACHE: + goto sync_out; + case F2FS_GET_BLOCK_BMAP: + map->m_pblk = 0; + goto sync_out; + case F2FS_GET_BLOCK_FIEMAP: + if (blkaddr == NULL_ADDR) { if (map->m_next_pgofs) *map->m_next_pgofs = pgofs + 1; goto sync_out; } + break; + default: + /* for defragment case */ + if 
(map->m_next_pgofs) + *map->m_next_pgofs = pgofs + 1; + goto sync_out; } } @@ -1660,9 +1668,9 @@ next_block: bidx = f2fs_target_device_index(sbi, blkaddr); if (map->m_len == 0) { - /* preallocated unwritten block should be mapped for fiemap. */ + /* reserved delalloc block should be mapped for fiemap. */ if (blkaddr == NEW_ADDR) - map->m_flags |= F2FS_MAP_UNWRITTEN; + map->m_flags |= F2FS_MAP_DELALLOC; map->m_flags |= F2FS_MAP_MAPPED; map->m_pblk = blkaddr; @@ -1721,7 +1729,7 @@ skip: f2fs_put_dnode(&dn); if (map->m_may_create) { - f2fs_do_map_lock(sbi, flag, false); + f2fs_map_unlock(sbi, flag); f2fs_balance_fs(sbi, dn.node_changed); } goto next_dnode; @@ -1767,11 +1775,11 @@ sync_out: f2fs_put_dnode(&dn); unlock_out: if (map->m_may_create) { - f2fs_do_map_lock(sbi, flag, false); + f2fs_map_unlock(sbi, flag); f2fs_balance_fs(sbi, dn.node_changed); } out: - trace_f2fs_map_blocks(inode, map, create, flag, err); + trace_f2fs_map_blocks(inode, map, flag, err); return err; } @@ -1793,7 +1801,7 @@ bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len) while (map.m_lblk < last_lblk) { map.m_len = last_lblk - map.m_lblk; - err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); + err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT); if (err || map.m_len == 0) return false; map.m_lblk += map.m_len; @@ -1967,7 +1975,7 @@ next: map.m_len = cluster_size - count_in_cluster; } - ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP); + ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_FIEMAP); if (ret) goto out; @@ -1984,7 +1992,7 @@ next: compr_appended = false; /* In a case of compressed cluster, append this to the last extent */ - if (compr_cluster && ((map.m_flags & F2FS_MAP_UNWRITTEN) || + if (compr_cluster && ((map.m_flags & F2FS_MAP_DELALLOC) || !(map.m_flags & F2FS_MAP_FLAGS))) { compr_appended = true; goto skip_fill; @@ -2030,7 +2038,7 @@ skip_fill: compr_cluster = false; size += blks_to_bytes(inode, 1); } - } else if (map.m_flags & F2FS_MAP_UNWRITTEN) { + } else if (map.m_flags & F2FS_MAP_DELALLOC) { flags = FIEMAP_EXTENT_UNWRITTEN; } @@ -2053,8 +2061,7 @@ out: static inline loff_t f2fs_readpage_limit(struct inode *inode) { - if (IS_ENABLED(CONFIG_FS_VERITY) && - (IS_VERITY(inode) || f2fs_verity_in_progress(inode))) + if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode)) return inode->i_sb->s_maxbytes; return i_size_read(inode); @@ -2100,7 +2107,7 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page, map->m_lblk = block_in_file; map->m_len = last_block - block_in_file; - ret = f2fs_map_blocks(inode, map, 0, F2FS_GET_BLOCK_DEFAULT); + ret = f2fs_map_blocks(inode, map, F2FS_GET_BLOCK_DEFAULT); if (ret) goto out; got_it: @@ -2137,7 +2144,7 @@ zero_out: *last_block_in_bio, block_nr) || !f2fs_crypt_mergeable_bio(bio, inode, page->index, NULL))) { submit_and_realloc: - __submit_bio(F2FS_I_SB(inode), bio, DATA); + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } if (bio == NULL) { @@ -2284,7 +2291,7 @@ skip_reading_dnode: *last_block_in_bio, blkaddr) || !f2fs_crypt_mergeable_bio(bio, inode, page->index, NULL))) { submit_and_realloc: - __submit_bio(sbi, bio, DATA); + f2fs_submit_read_bio(sbi, bio, DATA); bio = NULL; } @@ -2378,7 +2385,7 @@ static int f2fs_mpage_readpages(struct inode *inode, #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { - /* there are remained comressed pages, submit them */ + /* there are remained compressed pages, submit them */ if (!f2fs_cluster_can_merge_page(&cc, page->index)) { ret = 
f2fs_read_multi_pages(&cc, &bio, max_nr_pages, @@ -2445,7 +2452,7 @@ next_page: #endif } if (bio) - __submit_bio(F2FS_I_SB(inode), bio, DATA); + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); return ret; } @@ -2531,34 +2538,29 @@ static inline bool check_inplace_update_policy(struct inode *inode, struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - unsigned int policy = SM_I(sbi)->ipu_policy; - if (policy & (0x1 << F2FS_IPU_HONOR_OPU_WRITE) && - is_inode_flag_set(inode, FI_OPU_WRITE)) + if (IS_F2FS_IPU_HONOR_OPU_WRITE(sbi) && + is_inode_flag_set(inode, FI_OPU_WRITE)) return false; - if (policy & (0x1 << F2FS_IPU_FORCE)) + if (IS_F2FS_IPU_FORCE(sbi)) return true; - if (policy & (0x1 << F2FS_IPU_SSR) && f2fs_need_SSR(sbi)) + if (IS_F2FS_IPU_SSR(sbi) && f2fs_need_SSR(sbi)) return true; - if (policy & (0x1 << F2FS_IPU_UTIL) && - utilization(sbi) > SM_I(sbi)->min_ipu_util) + if (IS_F2FS_IPU_UTIL(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util) return true; - if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && f2fs_need_SSR(sbi) && - utilization(sbi) > SM_I(sbi)->min_ipu_util) + if (IS_F2FS_IPU_SSR_UTIL(sbi) && f2fs_need_SSR(sbi) && + utilization(sbi) > SM_I(sbi)->min_ipu_util) return true; /* * IPU for rewrite async pages */ - if (policy & (0x1 << F2FS_IPU_ASYNC) && - fio && fio->op == REQ_OP_WRITE && - !(fio->op_flags & REQ_SYNC) && - !IS_ENCRYPTED(inode)) + if (IS_F2FS_IPU_ASYNC(sbi) && fio && fio->op == REQ_OP_WRITE && + !(fio->op_flags & REQ_SYNC) && !IS_ENCRYPTED(inode)) return true; /* this is only set during fdatasync */ - if (policy & (0x1 << F2FS_IPU_FSYNC) && - is_inode_flag_set(inode, FI_NEED_IPU)) + if (IS_F2FS_IPU_FSYNC(sbi) && is_inode_flag_set(inode, FI_NEED_IPU)) return true; if (unlikely(fio && is_sbi_flag_set(sbi, SBI_CP_DISABLED) && @@ -2636,7 +2638,6 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) struct page *page = fio->page; struct inode *inode = page->mapping->host; struct dnode_of_data dn; - struct extent_info ei = {0, }; struct node_info ni; bool ipu_force = false; int err = 0; @@ -2648,9 +2649,8 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) set_new_dnode(&dn, inode, NULL, NULL, 0); if (need_inplace_update(fio) && - f2fs_lookup_read_extent_cache(inode, page->index, &ei)) { - fio->old_blkaddr = ei.blk + page->index - ei.fofs; - + f2fs_lookup_read_extent_cache_block(inode, page->index, + &fio->old_blkaddr)) { if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, DATA_GENERIC_ENHANCE)) { f2fs_handle_error(fio->sbi, @@ -2700,7 +2700,6 @@ got_it: goto out_writepage; set_page_writeback(page); - ClearPageError(page); f2fs_put_dnode(&dn); if (fio->need_lock == LOCK_REQ) f2fs_unlock_op(fio->sbi); @@ -2736,7 +2735,6 @@ got_it: goto out_writepage; set_page_writeback(page); - ClearPageError(page); if (fio->compr_blocks && fio->old_blkaddr == COMPRESS_ADDR) f2fs_i_compr_blocks_update(inode, fio->compr_blocks - 1, false); @@ -2781,10 +2779,10 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, - .submitted = false, + .submitted = 0, .compr_blocks = compr_blocks, .need_lock = LOCK_RETRY, - .post_read = f2fs_post_read_required(inode), + .post_read = f2fs_post_read_required(inode) ? 
1 : 0, .io_type = io_type, .io_wbc = wbc, .bio = bio, @@ -2793,7 +2791,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, trace_f2fs_writepage(page, DATA); - /* we should bypass data pages to proceed the kworkder jobs */ + /* we should bypass data pages to proceed the kworker jobs */ if (unlikely(f2fs_cp_error(sbi))) { mapping_set_error(page->mapping, -EIO); /* @@ -2905,14 +2903,14 @@ out: } if (submitted) - *submitted = fio.submitted ? 1 : 0; + *submitted = fio.submitted; return 0; redirty_out: redirty_page_for_writepage(wbc, page); /* - * pageout() in MM traslates EAGAIN, so calls handle_write_error() + * pageout() in MM translates EAGAIN, so calls handle_write_error() * -> mapping_set_error() -> set_bit(AS_EIO, ...). * file_write_and_wait_range() will see EIO error, which is critical * to return value of fsync() followed by atomic_write failure to user. @@ -2946,7 +2944,7 @@ out: } /* - * This function was copied from write_cche_pages from mm/page-writeback.c. + * This function was copied from write_cache_pages from mm/page-writeback.c. * The major change is making write step of cold data page separately from * warm/hot data page. */ @@ -2957,6 +2955,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int ret = 0; int done = 0, retry = 0; struct page *pages[F2FS_ONSTACK_PAGES]; + struct folio_batch fbatch; struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); struct bio *bio = NULL; sector_t last_block; @@ -2977,6 +2976,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, .private = NULL, }; #endif + int nr_folios, p, idx; int nr_pages; pgoff_t index; pgoff_t end; /* Inclusive */ @@ -2987,6 +2987,8 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int submitted = 0; int i; + folio_batch_init(&fbatch); + if (get_dirty_pages(mapping->host) <= SM_I(F2FS_M_SB(mapping))->min_hot_blocks) set_inode_flag(mapping->host, FI_HOT_DATA); @@ -3012,13 +3014,38 @@ retry: tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && !retry && (index <= end)) { - nr_pages = find_get_pages_range_tag(mapping, &index, end, - tag, F2FS_ONSTACK_PAGES, pages); - if (nr_pages == 0) + nr_pages = 0; +again: + nr_folios = filemap_get_folios_tag(mapping, &index, end, + tag, &fbatch); + if (nr_folios == 0) { + if (nr_pages) + goto write; break; + } + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; + + idx = 0; + p = folio_nr_pages(folio); +add_more: + pages[nr_pages] = folio_page(folio, idx); + folio_get(folio); + if (++nr_pages == F2FS_ONSTACK_PAGES) { + index = folio->index + idx + 1; + folio_batch_release(&fbatch); + goto write; + } + if (++idx < p) + goto add_more; + } + folio_batch_release(&fbatch); + goto again; +write: for (i = 0; i < nr_pages; i++) { struct page *page = pages[i]; + struct folio *folio = page_folio(page); bool need_readd; readd: need_readd = false; @@ -3035,7 +3062,7 @@ readd: } if (!f2fs_cluster_can_merge_page(&cc, - page->index)) { + folio->index)) { ret = f2fs_write_multi_pages(&cc, &submitted, wbc, io_type); if (!ret) @@ -3044,27 +3071,28 @@ readd: } if (unlikely(f2fs_cp_error(sbi))) - goto lock_page; + goto lock_folio; if (!f2fs_cluster_is_empty(&cc)) - goto lock_page; + goto lock_folio; if (f2fs_all_cluster_page_ready(&cc, pages, i, nr_pages, true)) - goto lock_page; + goto lock_folio; ret2 = f2fs_prepare_compress_overwrite( inode, &pagep, - page->index, &fsdata); + folio->index, &fsdata); if (ret2 < 0) { ret = ret2; done = 1; break; } else if (ret2 && 
(!f2fs_compress_write_end(inode, - fsdata, page->index, 1) || + fsdata, folio->index, 1) || !f2fs_all_cluster_page_ready(&cc, - pages, i, nr_pages, false))) { + pages, i, nr_pages, + false))) { retry = 1; break; } @@ -3077,46 +3105,47 @@ readd: break; } #ifdef CONFIG_F2FS_FS_COMPRESSION -lock_page: +lock_folio: #endif - done_index = page->index; + done_index = folio->index; retry_write: - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping != mapping)) { + if (unlikely(folio->mapping != mapping)) { continue_unlock: - unlock_page(page); + folio_unlock(folio); continue; } - if (!PageDirty(page)) { + if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } - if (PageWriteback(page)) { + if (folio_test_writeback(folio)) { if (wbc->sync_mode != WB_SYNC_NONE) - f2fs_wait_on_page_writeback(page, + f2fs_wait_on_page_writeback( + &folio->page, DATA, true, true); else goto continue_unlock; } - if (!clear_page_dirty_for_io(page)) + if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { - get_page(page); - f2fs_compress_ctx_add_page(&cc, page); + folio_get(folio); + f2fs_compress_ctx_add_page(&cc, &folio->page); continue; } #endif - ret = f2fs_write_single_data_page(page, &submitted, - &bio, &last_block, wbc, io_type, - 0, true); + ret = f2fs_write_single_data_page(&folio->page, + &submitted, &bio, &last_block, + wbc, io_type, 0, true); if (ret == AOP_WRITEPAGE_ACTIVATE) - unlock_page(page); + folio_unlock(folio); #ifdef CONFIG_F2FS_FS_COMPRESSION result: #endif @@ -3140,7 +3169,8 @@ result: } goto next; } - done_index = page->index + 1; + done_index = folio->index + + folio_nr_pages(folio); done = 1; break; } @@ -3323,9 +3353,8 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, struct dnode_of_data dn; struct page *ipage; bool locked = false; - struct extent_info ei = {0, }; + int flag = F2FS_GET_BLOCK_PRE_AIO; int err = 0; - int flag; /* * If a whole page is being written and we already preallocated all the @@ -3335,14 +3364,13 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, return 0; /* f2fs_lock_op avoids race between write CP and convert_inline_page */ - if (f2fs_has_inline_data(inode) && pos + len > MAX_INLINE_DATA(inode)) - flag = F2FS_GET_BLOCK_DEFAULT; - else - flag = F2FS_GET_BLOCK_PRE_AIO; - - if (f2fs_has_inline_data(inode) || - (pos & PAGE_MASK) >= i_size_read(inode)) { - f2fs_do_map_lock(sbi, flag, true); + if (f2fs_has_inline_data(inode)) { + if (pos + len > MAX_INLINE_DATA(inode)) + flag = F2FS_GET_BLOCK_DEFAULT; + f2fs_map_lock(sbi, flag); + locked = true; + } else if ((pos & PAGE_MASK) >= i_size_read(inode)) { + f2fs_map_lock(sbi, flag); locked = true; } @@ -3362,40 +3390,40 @@ restart: set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) set_page_private_inline(ipage); - } else { - err = f2fs_convert_inline_page(&dn, page); - if (err) - goto out; - if (dn.data_blkaddr == NULL_ADDR) - err = f2fs_get_block(&dn, index); - } - } else if (locked) { - err = f2fs_get_block(&dn, index); - } else { - if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { - dn.data_blkaddr = ei.blk + index - ei.fofs; - } else { - /* hole case */ - err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); - if (err || dn.data_blkaddr == NULL_ADDR) { - f2fs_put_dnode(&dn); - f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, - true); - WARN_ON(flag != F2FS_GET_BLOCK_PRE_AIO); - locked = true; - goto restart; - } + goto out; } + err = f2fs_convert_inline_page(&dn, page); + if 
(err || dn.data_blkaddr != NULL_ADDR) + goto out; } - /* convert_inline_page can make node_changed */ - *blk_addr = dn.data_blkaddr; - *node_changed = dn.node_changed; + if (!f2fs_lookup_read_extent_cache_block(inode, index, + &dn.data_blkaddr)) { + if (locked) { + err = f2fs_reserve_block(&dn, index); + goto out; + } + + /* hole case */ + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (!err && dn.data_blkaddr != NULL_ADDR) + goto out; + f2fs_put_dnode(&dn); + f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); + WARN_ON(flag != F2FS_GET_BLOCK_PRE_AIO); + locked = true; + goto restart; + } out: + if (!err) { + /* convert_inline_page can make node_changed */ + *blk_addr = dn.data_blkaddr; + *node_changed = dn.node_changed; + } f2fs_put_dnode(&dn); unlock_out: if (locked) - f2fs_do_map_lock(sbi, flag, false); + f2fs_map_unlock(sbi, flag); return err; } @@ -3404,7 +3432,6 @@ static int __find_data_block(struct inode *inode, pgoff_t index, { struct dnode_of_data dn; struct page *ipage; - struct extent_info ei = {0, }; int err = 0; ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); @@ -3413,9 +3440,8 @@ static int __find_data_block(struct inode *inode, pgoff_t index, set_new_dnode(&dn, inode, ipage, ipage, 0); - if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { - dn.data_blkaddr = ei.blk + index - ei.fofs; - } else { + if (!f2fs_lookup_read_extent_cache_block(inode, index, + &dn.data_blkaddr)) { /* hole case */ err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err) { @@ -3436,7 +3462,7 @@ static int __reserve_data_block(struct inode *inode, pgoff_t index, struct page *ipage; int err = 0; - f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); ipage = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { @@ -3445,14 +3471,16 @@ static int __reserve_data_block(struct inode *inode, pgoff_t index, } set_new_dnode(&dn, inode, ipage, ipage, 0); - err = f2fs_get_block(&dn, index); + if (!f2fs_lookup_read_extent_cache_block(dn.inode, index, + &dn.data_blkaddr)) + err = f2fs_reserve_block(&dn, index); *blk_addr = dn.data_blkaddr; *node_changed = dn.node_changed; f2fs_put_dnode(&dn); unlock_out: - f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); + f2fs_map_unlock(sbi, F2FS_GET_BLOCK_PRE_AIO); return err; } @@ -3698,6 +3726,7 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length) } } + clear_page_private_reference(&folio->page); clear_page_private_gcing(&folio->page); if (test_opt(sbi, COMPRESS_CACHE) && @@ -3723,6 +3752,7 @@ bool f2fs_release_folio(struct folio *folio, gfp_t wait) clear_page_private_data(&folio->page); } + clear_page_private_reference(&folio->page); clear_page_private_gcing(&folio->page); folio_detach_private(folio); @@ -3804,7 +3834,7 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) map.m_next_pgofs = NULL; map.m_seg_type = NO_CHECK_TYPE; - if (!f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_BMAP)) + if (!f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_BMAP)) blknr = map.m_pblk; } out: @@ -3912,7 +3942,7 @@ retry: map.m_seg_type = NO_CHECK_TYPE; map.m_may_create = false; - ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP); + ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_FIEMAP); if (ret) goto out; @@ -4137,8 +4167,7 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, if (flags & IOMAP_WRITE) map.m_may_create = true; - err = f2fs_map_blocks(inode, &map, flags & IOMAP_WRITE, - F2FS_GET_BLOCK_DIO); + err = 
f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DIO); if (err) return err; @@ -4151,20 +4180,24 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, */ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); - if (map.m_flags & (F2FS_MAP_MAPPED | F2FS_MAP_UNWRITTEN)) { - iomap->length = blks_to_bytes(inode, map.m_len); - if (map.m_flags & F2FS_MAP_MAPPED) { - iomap->type = IOMAP_MAPPED; - iomap->flags |= IOMAP_F_MERGED; - } else { - iomap->type = IOMAP_UNWRITTEN; - } - if (WARN_ON_ONCE(!__is_valid_data_blkaddr(map.m_pblk))) - return -EINVAL; + /* + * We should never see delalloc or compressed extents here based on + * prior flushing and checks. + */ + if (WARN_ON_ONCE(map.m_pblk == NEW_ADDR)) + return -EINVAL; + if (WARN_ON_ONCE(map.m_pblk == COMPRESS_ADDR)) + return -EINVAL; + if (map.m_pblk != NULL_ADDR) { + iomap->length = blks_to_bytes(inode, map.m_len); + iomap->type = IOMAP_MAPPED; + iomap->flags |= IOMAP_F_MERGED; iomap->bdev = map.m_bdev; iomap->addr = blks_to_bytes(inode, map.m_pblk); } else { + if (flags & IOMAP_WRITE) + return -ENOTBLK; iomap->length = blks_to_bytes(inode, next_pgofs) - iomap->offset; iomap->type = IOMAP_HOLE; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 32af4f0c5735..30a77936e3c5 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -354,6 +354,17 @@ static char *s_flag[] = { [SBI_IS_FREEZING] = " freezefs", }; +static const char *ipu_mode_names[F2FS_IPU_MAX] = { + [F2FS_IPU_FORCE] = "FORCE", + [F2FS_IPU_SSR] = "SSR", + [F2FS_IPU_UTIL] = "UTIL", + [F2FS_IPU_SSR_UTIL] = "SSR_UTIL", + [F2FS_IPU_FSYNC] = "FSYNC", + [F2FS_IPU_ASYNC] = "ASYNC", + [F2FS_IPU_NOCACHE] = "NOCACHE", + [F2FS_IPU_HONOR_OPU_WRITE] = "HONOR_OPU_WRITE", +}; + static int stat_show(struct seq_file *s, void *v) { struct f2fs_stat_info *si; @@ -362,16 +373,18 @@ static int stat_show(struct seq_file *s, void *v) raw_spin_lock_irqsave(&f2fs_stat_lock, flags); list_for_each_entry(si, &f2fs_stat_list, stat_list) { - update_general_status(si->sbi); + struct f2fs_sb_info *sbi = si->sbi; + + update_general_status(sbi); seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n", - si->sbi->sb->s_bdev, i++, - f2fs_readonly(si->sbi->sb) ? "RO" : "RW", - is_set_ckpt_flags(si->sbi, CP_DISABLED_FLAG) ? - "Disabled" : (f2fs_cp_error(si->sbi) ? "Error" : "Good")); - if (si->sbi->s_flag) { + sbi->sb->s_bdev, i++, + f2fs_readonly(sbi->sb) ? "RO" : "RW", + is_set_ckpt_flags(sbi, CP_DISABLED_FLAG) ? + "Disabled" : (f2fs_cp_error(sbi) ? 
"Error" : "Good")); + if (sbi->s_flag) { seq_puts(s, "[SBI:"); - for_each_set_bit(j, &si->sbi->s_flag, 32) + for_each_set_bit(j, &sbi->s_flag, 32) seq_puts(s, s_flag[j]); seq_puts(s, "]\n"); } @@ -383,8 +396,21 @@ static int stat_show(struct seq_file *s, void *v) si->overp_segs, si->rsvd_segs); seq_printf(s, "Current Time Sec: %llu / Mounted Time Sec: %llu\n\n", ktime_get_boottime_seconds(), - SIT_I(si->sbi)->mounted_time); - if (test_opt(si->sbi, DISCARD)) + SIT_I(sbi)->mounted_time); + + seq_puts(s, "Policy:\n"); + seq_puts(s, " - IPU: ["); + if (IS_F2FS_IPU_DISABLE(sbi)) { + seq_puts(s, " DISABLE"); + } else { + unsigned long policy = SM_I(sbi)->ipu_policy; + + for_each_set_bit(j, &policy, F2FS_IPU_MAX) + seq_printf(s, " %s", ipu_mode_names[j]); + } + seq_puts(s, " ]\n\n"); + + if (test_opt(sbi, DISCARD)) seq_printf(s, "Utilization: %u%% (%u valid blocks, %u discard blocks)\n", si->utilization, si->valid_count, si->discard_blks); else @@ -491,15 +517,15 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - node segments : %d (%d)\n", si->node_segs, si->bg_node_segs); seq_puts(s, " - Reclaimed segs :\n"); - seq_printf(s, " - Normal : %d\n", si->sbi->gc_reclaimed_segs[GC_NORMAL]); - seq_printf(s, " - Idle CB : %d\n", si->sbi->gc_reclaimed_segs[GC_IDLE_CB]); + seq_printf(s, " - Normal : %d\n", sbi->gc_reclaimed_segs[GC_NORMAL]); + seq_printf(s, " - Idle CB : %d\n", sbi->gc_reclaimed_segs[GC_IDLE_CB]); seq_printf(s, " - Idle Greedy : %d\n", - si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY]); - seq_printf(s, " - Idle AT : %d\n", si->sbi->gc_reclaimed_segs[GC_IDLE_AT]); + sbi->gc_reclaimed_segs[GC_IDLE_GREEDY]); + seq_printf(s, " - Idle AT : %d\n", sbi->gc_reclaimed_segs[GC_IDLE_AT]); seq_printf(s, " - Urgent High : %d\n", - si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH]); - seq_printf(s, " - Urgent Mid : %d\n", si->sbi->gc_reclaimed_segs[GC_URGENT_MID]); - seq_printf(s, " - Urgent Low : %d\n", si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]); + sbi->gc_reclaimed_segs[GC_URGENT_HIGH]); + seq_printf(s, " - Urgent Mid : %d\n", sbi->gc_reclaimed_segs[GC_URGENT_MID]); + seq_printf(s, " - Urgent Low : %d\n", sbi->gc_reclaimed_segs[GC_URGENT_LOW]); seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks, si->bg_data_blks + si->bg_node_blks); seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks, @@ -565,7 +591,7 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_imeta); seq_printf(s, " - fsync mark: %4lld\n", percpu_counter_sum_positive( - &si->sbi->rf_node_block_count)); + &sbi->rf_node_block_count)); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n", @@ -592,12 +618,12 @@ static int stat_show(struct seq_file *s, void *v) si->block_count[LFS], si->segment_count[LFS]); /* segment usage info */ - f2fs_update_sit_info(si->sbi); + f2fs_update_sit_info(sbi); seq_printf(s, "\nBDF: %u, avg. 
vblocks: %u\n", si->bimodal, si->avg_vblocks); /* memory footprint */ - update_mem_info(si->sbi); + update_mem_info(sbi); seq_printf(s, "\nMemory: %llu KB\n", (si->base_mem + si->cache_mem + si->page_mem) >> 10); seq_printf(s, " - static: %llu KB\n", diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 8e025157f35c..9ccdbe120425 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -732,10 +732,8 @@ int f2fs_add_regular_entry(struct inode *dir, const struct f2fs_filename *fname, } start: - if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) { - f2fs_show_injection_info(F2FS_I_SB(dir), FAULT_DIR_DEPTH); + if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) return -ENOSPC; - } if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) return -ENOSPC; diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 342af24b2f8c..28b12553f2b3 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -19,6 +19,31 @@ #include "node.h" #include <trace/events/f2fs.h> +bool sanity_check_extent_cache(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct extent_info *ei; + + if (!fi->extent_tree[EX_READ]) + return true; + + ei = &fi->extent_tree[EX_READ]->largest; + + if (ei->len && + (!f2fs_is_valid_blkaddr(sbi, ei->blk, + DATA_GENERIC_ENHANCE) || + !f2fs_is_valid_blkaddr(sbi, ei->blk + ei->len - 1, + DATA_GENERIC_ENHANCE))) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: inode (ino=%lx) extent info [%u, %u, %u] is incorrect, run fsck to fix", + __func__, inode->i_ino, + ei->blk, ei->fofs, ei->len); + return false; + } + return true; +} + static void __set_extent_info(struct extent_info *ei, unsigned int fofs, unsigned int len, block_t blk, bool keep_clen, @@ -233,7 +258,7 @@ struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, * @prev_ex: extent before ofs * @next_ex: extent after ofs * @insert_p: insert point for new extent at ofs - * in order to simpfy the insertion after. + * in order to simplify the insertion after. * tree must stay unchanged between lookup and insertion. */ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, @@ -718,7 +743,7 @@ static void __update_extent_tree_range(struct inode *inode, if (!en) en = next_en; - /* 2. invlidate all extent nodes in range [fofs, fofs + len - 1] */ + /* 2. invalidate all extent nodes in range [fofs, fofs + len - 1] */ while (en && en->ei.fofs < end) { unsigned int org_end; int parts = 0; /* # of parts current extent split into */ @@ -871,14 +896,23 @@ unlock_out: } #endif -static unsigned long long __calculate_block_age(unsigned long long new, +static unsigned long long __calculate_block_age(struct f2fs_sb_info *sbi, + unsigned long long new, unsigned long long old) { - unsigned long long diff; + unsigned int rem_old, rem_new; + unsigned long long res; + unsigned int weight = sbi->last_age_weight; + + res = div_u64_rem(new, 100, &rem_new) * (100 - weight) + + div_u64_rem(old, 100, &rem_old) * weight; - diff = (new >= old) ? 
new - (new - old) : new + (old - new); + if (rem_new) + res += rem_new * (100 - weight) / 100; + if (rem_old) + res += rem_old * weight / 100; - return div_u64(diff * LAST_AGE_WEIGHT, 100); + return res; } /* This returns a new age and allocated blocks in ei */ @@ -910,7 +944,7 @@ static int __get_new_block_age(struct inode *inode, struct extent_info *ei, cur_age = ULLONG_MAX - tei.last_blocks + cur_blocks; if (tei.age) - ei->age = __calculate_block_age(cur_age, tei.age); + ei->age = __calculate_block_age(sbi, cur_age, tei.age); else ei->age = cur_age; ei->last_blocks = cur_blocks; @@ -1047,6 +1081,17 @@ bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, return __lookup_extent_tree(inode, pgofs, ei, EX_READ); } +bool f2fs_lookup_read_extent_cache_block(struct inode *inode, pgoff_t index, + block_t *blkaddr) +{ + struct extent_info ei = {}; + + if (!f2fs_lookup_read_extent_cache(inode, index, &ei)) + return false; + *blkaddr = ei.blk + index - ei.fofs; + return true; +} + void f2fs_update_read_extent_cache(struct dnode_of_data *dn) { return __update_extent_cache(dn, EX_READ); @@ -1226,6 +1271,7 @@ void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi) atomic64_set(&sbi->allocated_data_blocks, 0); sbi->hot_data_age_threshold = DEF_HOT_DATA_AGE_THRESHOLD; sbi->warm_data_age_threshold = DEF_WARM_DATA_AGE_THRESHOLD; + sbi->last_age_weight = LAST_AGE_WEIGHT; } int __init f2fs_create_extent_cache(void) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e8953c3dc81a..b0ab2062038a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -402,7 +402,6 @@ struct discard_cmd_control { struct list_head wait_list; /* store on-flushing entries */ struct list_head fstrim_list; /* in-flight discard from fstrim */ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ - unsigned int discard_wake; /* to wake up discard thread */ struct mutex cmd_lock; unsigned int nr_discards; /* # of discards in the list */ unsigned int max_discards; /* max. discards to be issued */ @@ -410,6 +409,7 @@ struct discard_cmd_control { unsigned int min_discard_issue_time; /* min. interval between discard issue */ unsigned int mid_discard_issue_time; /* mid. interval between discard issue */ unsigned int max_discard_issue_time; /* max. interval between discard issue */ + unsigned int discard_io_aware_gran; /* minimum discard granularity not be aware of I/O */ unsigned int discard_urgent_util; /* utilization which issue discard proactively */ unsigned int discard_granularity; /* discard granularity */ unsigned int max_ordered_discard; /* maximum discard granularity issued by lba order */ @@ -420,6 +420,7 @@ struct discard_cmd_control { atomic_t discard_cmd_cnt; /* # of cached cmd count */ struct rb_root_cached root; /* root of discard rb-tree */ bool rbtree_check; /* config for consistence check */ + bool discard_wake; /* to wake up discard thread */ }; /* for the list of fsync inodes, used only during recovery */ @@ -692,15 +693,13 @@ struct extent_tree_info { }; /* - * This structure is taken from ext4_map_blocks. - * - * Note that, however, f2fs uses NEW and MAPPED flags for f2fs_map_blocks(). + * State of block returned by f2fs_map_blocks. 
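The rewritten __calculate_block_age() above computes the weighted average new*(100-weight)/100 + old*weight/100 without overflowing 64 bits: each operand is split into quotient and remainder by 100 before multiplying. A minimal userspace sketch of the same arithmetic — plain / and % stand in for div_u64_rem(), which the kernel needs because native 64-bit division is unavailable on 32-bit builds; calc_block_age is an illustrative name and 30 mirrors the default LAST_AGE_WEIGHT:

/* Standalone sketch of the remainder-splitting trick. */
#include <stdio.h>

static unsigned long long calc_block_age(unsigned long long new,
					 unsigned long long old,
					 unsigned int weight)
{
	unsigned long long res;

	/* multiply only the quotients, so the products cannot overflow */
	res = (new / 100) * (100 - weight) + (old / 100) * weight;
	/* fold the remainders back in, already scaled down by 100 */
	res += (new % 100) * (100 - weight) / 100;
	res += (old % 100) * weight / 100;
	return res;
}

int main(void)
{
	/* prints 1300: 0.7 * 1000 + 0.3 * 2000 */
	printf("%llu\n", calc_block_age(1000, 2000, 30));
	return 0;
}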
*/ -#define F2FS_MAP_NEW (1 << BH_New) -#define F2FS_MAP_MAPPED (1 << BH_Mapped) -#define F2FS_MAP_UNWRITTEN (1 << BH_Unwritten) +#define F2FS_MAP_NEW (1U << 0) +#define F2FS_MAP_MAPPED (1U << 1) +#define F2FS_MAP_DELALLOC (1U << 2) #define F2FS_MAP_FLAGS (F2FS_MAP_NEW | F2FS_MAP_MAPPED |\ - F2FS_MAP_UNWRITTEN) + F2FS_MAP_DELALLOC) struct f2fs_map_blocks { struct block_device *m_bdev; /* for multi-device dio */ @@ -870,7 +869,7 @@ struct f2fs_inode_info { unsigned char i_compress_algorithm; /* algorithm type */ unsigned char i_log_cluster_size; /* log of cluster size */ unsigned char i_compress_level; /* compress level (lz4hc,zstd) */ - unsigned short i_compress_flag; /* compress flag */ + unsigned char i_compress_flag; /* compress flag */ unsigned int i_cluster_size; /* cluster size */ unsigned int atomic_write_cnt; @@ -1193,7 +1192,8 @@ enum iostat_type { FS_META_READ_IO, /* meta read IOs */ /* other */ - FS_DISCARD, /* discard */ + FS_DISCARD_IO, /* discard */ + FS_FLUSH_IO, /* flush */ NR_IO_TYPE, }; @@ -1210,19 +1210,19 @@ struct f2fs_io_info { struct page *encrypted_page; /* encrypted page */ struct page *compressed_page; /* compressed page */ struct list_head list; /* serialize IOs */ - bool submitted; /* indicate IO submission */ - int need_lock; /* indicate we need to lock cp_rwsem */ - bool in_list; /* indicate fio is in io_list */ - bool is_por; /* indicate IO is from recovery or not */ - bool retry; /* need to reallocate block address */ - int compr_blocks; /* # of compressed block addresses */ - bool encrypted; /* indicate file is encrypted */ - bool post_read; /* require post read */ + unsigned int compr_blocks; /* # of compressed block addresses */ + unsigned int need_lock:8; /* indicate we need to lock cp_rwsem */ + unsigned int version:8; /* version of the node */ + unsigned int submitted:1; /* indicate IO submission */ + unsigned int in_list:1; /* indicate fio is in io_list */ + unsigned int is_por:1; /* indicate IO is from recovery or not */ + unsigned int retry:1; /* need to reallocate block address */ + unsigned int encrypted:1; /* indicate file is encrypted */ + unsigned int post_read:1; /* require post read */ enum iostat_type io_type; /* io type */ struct writeback_control *io_wbc; /* writeback control */ struct bio **bio; /* bio for ipu */ sector_t *last_block; /* last block number in bio */ - unsigned char version; /* version of the node */ }; struct bio_entry { @@ -1384,8 +1384,6 @@ enum { MEMORY_MODE_LOW, /* memory mode for low memry devices */ }; - - static inline int f2fs_test_bit(unsigned int nr, char *addr); static inline void f2fs_set_bit(unsigned int nr, char *addr); static inline void f2fs_clear_bit(unsigned int nr, char *addr); @@ -1396,19 +1394,17 @@ static inline void f2fs_clear_bit(unsigned int nr, char *addr); * Layout A: lowest bit should be 1 * | bit0 = 1 | bit1 | bit2 | ... | bit MAX | private data .... | * bit 0 PAGE_PRIVATE_NOT_POINTER - * bit 1 PAGE_PRIVATE_ATOMIC_WRITE - * bit 2 PAGE_PRIVATE_DUMMY_WRITE - * bit 3 PAGE_PRIVATE_ONGOING_MIGRATION - * bit 4 PAGE_PRIVATE_INLINE_INODE - * bit 5 PAGE_PRIVATE_REF_RESOURCE - * bit 6- f2fs private data + * bit 1 PAGE_PRIVATE_DUMMY_WRITE + * bit 2 PAGE_PRIVATE_ONGOING_MIGRATION + * bit 3 PAGE_PRIVATE_INLINE_INODE + * bit 4 PAGE_PRIVATE_REF_RESOURCE + * bit 5- f2fs private data * * Layout B: lowest bit should be 0 * page.private is a wrapped pointer. 
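The f2fs_io_info churn above packs six bools plus need_lock and version into a single 32-bit word of C bitfields. A standalone sketch of the size effect — the struct names here are illustrative, not f2fs's:

/* Compare one field per flag against bitfields packed into one word. */
#include <stdio.h>

struct io_flags_loose {
	_Bool submitted, in_list, is_por, retry, encrypted, post_read;
	int need_lock;
	unsigned char version;
};

struct io_flags_packed {
	unsigned int need_lock:8;
	unsigned int version:8;
	unsigned int submitted:1;
	unsigned int in_list:1;
	unsigned int is_por:1;
	unsigned int retry:1;
	unsigned int encrypted:1;
	unsigned int post_read:1;
};

int main(void)
{
	printf("loose: %zu bytes, packed: %zu bytes\n",
	       sizeof(struct io_flags_loose),
	       sizeof(struct io_flags_packed));
	return 0;
}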
*/ enum { PAGE_PRIVATE_NOT_POINTER, /* private contains non-pointer data */ - PAGE_PRIVATE_ATOMIC_WRITE, /* data page from atomic write path */ PAGE_PRIVATE_DUMMY_WRITE, /* data page for padding aligned IO */ PAGE_PRIVATE_ONGOING_MIGRATION, /* data page which is on-going migrating */ PAGE_PRIVATE_INLINE_INODE, /* inode page contains inline data */ @@ -1450,22 +1446,18 @@ static inline void clear_page_private_##name(struct page *page) \ } PAGE_PRIVATE_GET_FUNC(nonpointer, NOT_POINTER); -PAGE_PRIVATE_GET_FUNC(reference, REF_RESOURCE); PAGE_PRIVATE_GET_FUNC(inline, INLINE_INODE); PAGE_PRIVATE_GET_FUNC(gcing, ONGOING_MIGRATION); -PAGE_PRIVATE_GET_FUNC(atomic, ATOMIC_WRITE); PAGE_PRIVATE_GET_FUNC(dummy, DUMMY_WRITE); PAGE_PRIVATE_SET_FUNC(reference, REF_RESOURCE); PAGE_PRIVATE_SET_FUNC(inline, INLINE_INODE); PAGE_PRIVATE_SET_FUNC(gcing, ONGOING_MIGRATION); -PAGE_PRIVATE_SET_FUNC(atomic, ATOMIC_WRITE); PAGE_PRIVATE_SET_FUNC(dummy, DUMMY_WRITE); PAGE_PRIVATE_CLEAR_FUNC(reference, REF_RESOURCE); PAGE_PRIVATE_CLEAR_FUNC(inline, INLINE_INODE); PAGE_PRIVATE_CLEAR_FUNC(gcing, ONGOING_MIGRATION); -PAGE_PRIVATE_CLEAR_FUNC(atomic, ATOMIC_WRITE); PAGE_PRIVATE_CLEAR_FUNC(dummy, DUMMY_WRITE); static inline unsigned long get_page_private_data(struct page *page) @@ -1679,6 +1671,7 @@ struct f2fs_sb_info { /* The threshold used for hot and warm data seperation*/ unsigned int hot_data_age_threshold; unsigned int warm_data_age_threshold; + unsigned int last_age_weight; /* basic filesystem units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ @@ -1864,8 +1857,9 @@ struct f2fs_sb_info { #ifdef CONFIG_F2FS_IOSTAT /* For app/fs IO statistics */ spinlock_t iostat_lock; - unsigned long long rw_iostat[NR_IO_TYPE]; - unsigned long long prev_rw_iostat[NR_IO_TYPE]; + unsigned long long iostat_count[NR_IO_TYPE]; + unsigned long long iostat_bytes[NR_IO_TYPE]; + unsigned long long prev_iostat_bytes[NR_IO_TYPE]; bool iostat_enable; unsigned long iostat_next_period; unsigned int iostat_period_ms; @@ -1877,12 +1871,10 @@ struct f2fs_sb_info { }; #ifdef CONFIG_F2FS_FAULT_INJECTION -#define f2fs_show_injection_info(sbi, type) \ - printk_ratelimited("%sF2FS-fs (%s) : inject %s in %s of %pS\n", \ - KERN_INFO, sbi->sb->s_id, \ - f2fs_fault_name[type], \ - __func__, __builtin_return_address(0)) -static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) +#define time_to_inject(sbi, type) __time_to_inject(sbi, type, __func__, \ + __builtin_return_address(0)) +static inline bool __time_to_inject(struct f2fs_sb_info *sbi, int type, + const char *func, const char *parent_func) { struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; @@ -1895,12 +1887,14 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) atomic_inc(&ffi->inject_ops); if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { atomic_set(&ffi->inject_ops, 0); + printk_ratelimited("%sF2FS-fs (%s) : inject %s in %s of %pS\n", + KERN_INFO, sbi->sb->s_id, f2fs_fault_name[type], + func, parent_func); return true; } return false; } #else -#define f2fs_show_injection_info(sbi, type) do { } while (0) static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) { return false; @@ -2233,10 +2227,8 @@ static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi) { - if (time_to_inject(sbi, FAULT_LOCK_OP)) { - f2fs_show_injection_info(sbi, FAULT_LOCK_OP); + if (time_to_inject(sbi, FAULT_LOCK_OP)) return 0; - } return f2fs_down_read_trylock(&sbi->cp_rwsem); } @@ 
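With f2fs_show_injection_info() removed, time_to_inject() becomes a macro so that __time_to_inject() receives the caller's __func__ and __builtin_return_address(0) and can emit the rate-limited message itself when an injection fires, instead of every call site logging separately. A compilable sketch of the pattern (GCC/Clang builtins; the rate-counter decision logic is elided):

#include <stdio.h>

#define time_to_inject(type) \
	__time_to_inject(type, __func__, __builtin_return_address(0))

static int __time_to_inject(int type, const char *func, void *parent)
{
	/* real code checks inject_ops against inject_rate here */
	printf("inject %d in %s of %p\n", type, func, parent);
	return 1;
}

static int alloc_something(void)
{
	if (time_to_inject(0))	/* fault type 0 is illustrative */
		return -1;
	return 0;
}

int main(void)
{
	return alloc_something();
}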
-2324,7 +2316,6 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, return ret; if (time_to_inject(sbi, FAULT_BLOCK)) { - f2fs_show_injection_info(sbi, FAULT_BLOCK); release = *count; goto release_quota; } @@ -2604,10 +2595,8 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, return err; } - if (time_to_inject(sbi, FAULT_BLOCK)) { - f2fs_show_injection_info(sbi, FAULT_BLOCK); + if (time_to_inject(sbi, FAULT_BLOCK)) goto enospc; - } spin_lock(&sbi->stat_lock); @@ -2731,11 +2720,8 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, if (page) return page; - if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) { - f2fs_show_injection_info(F2FS_M_SB(mapping), - FAULT_PAGE_ALLOC); + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) return NULL; - } } if (!for_write) @@ -2752,10 +2738,8 @@ static inline struct page *f2fs_pagecache_get_page( struct address_space *mapping, pgoff_t index, int fgp_flags, gfp_t gfp_mask) { - if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) { - f2fs_show_injection_info(F2FS_M_SB(mapping), FAULT_PAGE_GET); + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) return NULL; - } return pagecache_get_page(mapping, index, fgp_flags, gfp_mask); } @@ -2805,10 +2789,8 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, if (nofail) return f2fs_kmem_cache_alloc_nofail(cachep, flags); - if (time_to_inject(sbi, FAULT_SLAB_ALLOC)) { - f2fs_show_injection_info(sbi, FAULT_SLAB_ALLOC); + if (time_to_inject(sbi, FAULT_SLAB_ALLOC)) return NULL; - } return kmem_cache_alloc(cachep, flags); } @@ -3382,10 +3364,8 @@ static inline bool is_dot_dotdot(const u8 *name, size_t len) static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { - if (time_to_inject(sbi, FAULT_KMALLOC)) { - f2fs_show_injection_info(sbi, FAULT_KMALLOC); + if (time_to_inject(sbi, FAULT_KMALLOC)) return NULL; - } return kmalloc(size, flags); } @@ -3399,10 +3379,8 @@ static inline void *f2fs_kzalloc(struct f2fs_sb_info *sbi, static inline void *f2fs_kvmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { - if (time_to_inject(sbi, FAULT_KVMALLOC)) { - f2fs_show_injection_info(sbi, FAULT_KVMALLOC); + if (time_to_inject(sbi, FAULT_KVMALLOC)) return NULL; - } return kvmalloc(size, flags); } @@ -3469,15 +3447,15 @@ void f2fs_truncate_data_blocks(struct dnode_of_data *dn); int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate(struct inode *inode); -int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int f2fs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); -int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count); int f2fs_precache_extents(struct inode *inode); int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int f2fs_fileattr_set(struct user_namespace *mnt_userns, +int f2fs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -3505,7 +3483,7 
@@ void f2fs_handle_failed_inode(struct inode *inode); int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool hot, bool set); struct dentry *f2fs_get_parent(struct dentry *child); -int f2fs_get_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +int f2fs_get_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct inode **new_inode); /* @@ -3788,8 +3766,8 @@ int __init f2fs_init_bioset(void); void f2fs_destroy_bioset(void); int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); -void f2fs_submit_bio(struct f2fs_sb_info *sbi, - struct bio *bio, enum page_type type); +void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio, + enum page_type type); int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi); void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, @@ -3808,7 +3786,7 @@ void f2fs_set_data_blkaddr(struct dnode_of_data *dn); void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); int f2fs_reserve_new_block(struct dnode_of_data *dn); -int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); +int f2fs_get_block_locked(struct dnode_of_data *dn, pgoff_t index); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs); @@ -3819,9 +3797,7 @@ struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, struct page *f2fs_get_new_data_page(struct inode *inode, struct page *ipage, pgoff_t index, bool new_i_size); int f2fs_do_write_data_page(struct f2fs_io_info *fio); -void f2fs_do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock); -int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, - int create, int flag); +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); int f2fs_encrypt_one_page(struct f2fs_io_info *fio); @@ -4161,6 +4137,7 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); /* * extent_cache.c */ +bool sanity_check_extent_cache(struct inode *inode); struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root, struct rb_entry *cached_re, unsigned int ofs); struct rb_node **f2fs_lookup_rb_tree_ext(struct f2fs_sb_info *sbi, @@ -4190,6 +4167,8 @@ void f2fs_destroy_extent_cache(void); void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage); bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, struct extent_info *ei); +bool f2fs_lookup_read_extent_cache_block(struct inode *inode, pgoff_t index, + block_t *blkaddr); void f2fs_update_read_extent_cache(struct dnode_of_data *dn); void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, pgoff_t fofs, block_t blkaddr, unsigned int len); @@ -4259,7 +4238,7 @@ bool f2fs_compress_write_end(struct inode *inode, void *fsdata, int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock); void f2fs_compress_write_end_io(struct bio *bio, struct page *page); bool f2fs_is_compress_backend_ready(struct inode *inode); -int f2fs_init_compress_mempool(void); +int __init f2fs_init_compress_mempool(void); void f2fs_destroy_compress_mempool(void); void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task); void f2fs_end_read_compressed_page(struct 
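The new f2fs_lookup_read_extent_cache_block() wraps the recurring lookup-then-offset idiom: on a cache hit the physical block is ei.blk + (index - ei.fofs). A userspace mock of that contract — the struct and lookup_block() are simplified stand-ins, not f2fs API:

#include <stdbool.h>
#include <stdio.h>

struct extent_info { unsigned int fofs, len, blk; };

static bool lookup_block(const struct extent_info *ei, unsigned int index,
			 unsigned int *blkaddr)
{
	if (index < ei->fofs || index >= ei->fofs + ei->len)
		return false;			/* cache miss */
	*blkaddr = ei->blk + (index - ei->fofs);
	return true;
}

int main(void)
{
	struct extent_info ei = { .fofs = 8, .len = 4, .blk = 1000 };
	unsigned int blk;

	if (lookup_block(&ei, 10, &blk))
		printf("index 10 -> block %u\n", blk);	/* prints 1002 */
	return 0;
}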
page *page, bool failed, @@ -4328,7 +4307,7 @@ static inline struct page *f2fs_compress_control_page(struct page *page) WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); } -static inline int f2fs_init_compress_mempool(void) { return 0; } +static inline int __init f2fs_init_compress_mempool(void) { return 0; } static inline void f2fs_destroy_compress_mempool(void) { } static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) { } @@ -4381,9 +4360,8 @@ static inline int set_compress_context(struct inode *inode) if ((F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 || F2FS_I(inode)->i_compress_algorithm == COMPRESS_ZSTD) && F2FS_OPTION(sbi).compress_level) - F2FS_I(inode)->i_compress_flag |= - F2FS_OPTION(sbi).compress_level << - COMPRESS_LEVEL_OFFSET; + F2FS_I(inode)->i_compress_level = + F2FS_OPTION(sbi).compress_level; F2FS_I(inode)->i_flags |= F2FS_COMPR_FL; set_inode_flag(inode, FI_COMPRESSED_FILE); stat_inc_compr_inode(inode); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ecbc8c135b49..15dabeac4690 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -113,10 +113,8 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) if (need_alloc) { /* block allocation */ - f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = f2fs_get_block(&dn, page->index); - f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); + err = f2fs_get_block_locked(&dn, page->index); } #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -305,7 +303,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, * for OPU case, during fsync(), node can be persisted before * data when lower device doesn't support write barrier, result * in data corruption after SPO. - * So for strict fsync mode, force to use atomic write sematics + * So for strict fsync mode, force to use atomic write semantics * to keep write order in between data/node and last node to * avoid potential data corruption. 
*/ @@ -619,7 +617,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + ofs; f2fs_update_read_extent_cache_range(dn, fofs, 0, len); - f2fs_update_age_extent_cache_range(dn, fofs, nr_free); + f2fs_update_age_extent_cache_range(dn, fofs, len); dec_valid_block_count(sbi, dn->inode, nr_free); } dn->ofs_in_node = ofs; @@ -784,10 +782,8 @@ int f2fs_truncate(struct inode *inode) trace_f2fs_truncate(inode); - if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) { - f2fs_show_injection_info(F2FS_I_SB(inode), FAULT_TRUNCATE); + if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) return -EIO; - } err = f2fs_dquot_initialize(inode); if (err) @@ -837,7 +833,7 @@ static bool f2fs_force_buffered_io(struct inode *inode, int rw) return false; } -int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int f2fs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); @@ -892,7 +888,7 @@ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, STATX_ATTR_NODUMP | STATX_ATTR_VERITY); - generic_fillattr(mnt_userns, inode, stat); + generic_fillattr(idmap, inode, stat); /* we need to show initial sectors used for inline_data/dentries */ if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) || @@ -903,13 +899,13 @@ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, } #ifdef CONFIG_F2FS_FS_POSIX_ACL -static void __setattr_copy(struct user_namespace *mnt_userns, +static void __setattr_copy(struct mnt_idmap *idmap, struct inode *inode, const struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; - i_uid_update(mnt_userns, attr, inode); - i_gid_update(mnt_userns, attr, inode); + i_uid_update(idmap, attr, inode); + i_gid_update(idmap, attr, inode); if (ia_valid & ATTR_ATIME) inode->i_atime = attr->ia_atime; if (ia_valid & ATTR_MTIME) @@ -918,10 +914,10 @@ static void __setattr_copy(struct user_namespace *mnt_userns, inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); if (!vfsgid_in_group_p(vfsgid) && - !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + !capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID)) mode &= ~S_ISGID; set_acl_inode(inode, mode); } @@ -930,7 +926,7 @@ static void __setattr_copy(struct user_namespace *mnt_userns, #define __setattr_copy setattr_copy #endif -int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -951,7 +947,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, !f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; - err = setattr_prepare(mnt_userns, dentry, attr); + err = setattr_prepare(idmap, dentry, attr); if (err) return err; @@ -963,15 +959,15 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (err) return err; - if (is_quota_modification(mnt_userns, inode, attr)) { + if (is_quota_modification(idmap, inode, attr)) { err = f2fs_dquot_initialize(inode); if (err) return err; } - if (i_uid_needs_update(mnt_userns, attr, inode) || - i_gid_needs_update(mnt_userns, attr, inode)) { + if (i_uid_needs_update(idmap, attr, inode) || + 
i_gid_needs_update(idmap, attr, inode)) { f2fs_lock_op(F2FS_I_SB(inode)); - err = dquot_transfer(mnt_userns, inode, attr); + err = dquot_transfer(idmap, inode, attr); if (err) { set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR); @@ -982,8 +978,8 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, * update uid/gid under lock_op(), so that dquot and inode can * be updated atomically. */ - i_uid_update(mnt_userns, attr, inode); - i_gid_update(mnt_userns, attr, inode); + i_uid_update(idmap, attr, inode); + i_gid_update(idmap, attr, inode); f2fs_mark_inode_dirty_sync(inode, true); f2fs_unlock_op(F2FS_I_SB(inode)); } @@ -1023,10 +1019,10 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, spin_unlock(&F2FS_I(inode)->i_size_lock); } - __setattr_copy(mnt_userns, inode, attr); + __setattr_copy(idmap, inode, attr); if (attr->ia_valid & ATTR_MODE) { - err = posix_acl_chmod(mnt_userns, dentry, f2fs_get_inode_mode(inode)); + err = posix_acl_chmod(idmap, dentry, f2fs_get_inode_mode(inode)); if (is_inode_flag_set(inode, FI_ACL_MODE)) { if (!err) @@ -1112,7 +1108,7 @@ int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) return 0; } -static int punch_hole(struct inode *inode, loff_t offset, loff_t len) +static int f2fs_punch_hole(struct inode *inode, loff_t offset, loff_t len) { pgoff_t pg_start, pg_end; loff_t off_start, off_end; @@ -1498,6 +1494,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, } f2fs_update_read_extent_cache_range(dn, start, 0, index - start); + f2fs_update_age_extent_cache_range(dn, start, index - start); return ret; } @@ -1684,7 +1681,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) return ret; } -static int expand_inode_data(struct inode *inode, loff_t offset, +static int f2fs_expand_inode_data(struct inode *inode, loff_t offset, loff_t len, int mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1697,7 +1694,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, .err_gc_skipped = true, .nr_free_secs = 0 }; pgoff_t pg_start, pg_end; - loff_t new_size = i_size_read(inode); + loff_t new_size; loff_t off_end; block_t expanded = 0; int err; @@ -1745,7 +1742,7 @@ next_alloc: f2fs_unlock_op(sbi); map.m_seg_type = CURSEG_COLD_DATA_PINNED; - err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); + err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_PRE_DIO); file_dont_truncate(inode); f2fs_up_write(&sbi->pin_sem); @@ -1758,7 +1755,7 @@ next_alloc: map.m_len = expanded; } else { - err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_PRE_AIO); expanded = map.m_len; } out_err: @@ -1809,7 +1806,7 @@ static long f2fs_fallocate(struct file *file, int mode, return -EOPNOTSUPP; /* - * Pinned file should not support partial trucation since the block + * Pinned file should not support partial truncation since the block * can be used by applications. 
*/ if ((f2fs_compressed_file(inode) || f2fs_is_pinned_file(inode)) && @@ -1832,7 +1829,7 @@ static long f2fs_fallocate(struct file *file, int mode, if (offset >= inode->i_size) goto out; - ret = punch_hole(inode, offset, len); + ret = f2fs_punch_hole(inode, offset, len); } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { ret = f2fs_collapse_range(inode, offset, len); } else if (mode & FALLOC_FL_ZERO_RANGE) { @@ -1840,7 +1837,7 @@ static long f2fs_fallocate(struct file *file, int mode, } else if (mode & FALLOC_FL_INSERT_RANGE) { ret = f2fs_insert_range(inode, offset, len); } else { - ret = expand_inode_data(inode, offset, len, mode); + ret = f2fs_expand_inode_data(inode, offset, len, mode); } if (!ret) { @@ -1859,14 +1856,17 @@ out: static int f2fs_release_file(struct inode *inode, struct file *filp) { /* - * f2fs_relase_file is called at every close calls. So we should + * f2fs_release_file is called at every close calls. So we should * not drop any inmemory pages by close called by other process. */ if (!(filp->f_mode & FMODE_WRITE) || atomic_read(&inode->i_writecount) != 1) return 0; + inode_lock(inode); f2fs_abort_atomic_write(inode, true); + inode_unlock(inode); + return 0; } @@ -1880,8 +1880,13 @@ static int f2fs_file_flush(struct file *file, fl_owner_t id) * until all the writers close its file. Since this should be done * before dropping file lock, it needs to do in ->flush. */ - if (F2FS_I(inode)->atomic_write_task == current) + if (F2FS_I(inode)->atomic_write_task == current && + (current->flags & PF_EXITING)) { + inode_lock(inode); f2fs_abort_atomic_write(inode, true); + inode_unlock(inode); + } + return 0; } @@ -2038,14 +2043,14 @@ static int f2fs_ioc_getversion(struct file *filp, unsigned long arg) static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate) { struct inode *inode = file_inode(filp); - struct user_namespace *mnt_userns = file_mnt_user_ns(filp); + struct mnt_idmap *idmap = file_mnt_idmap(filp); struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct inode *pinode; loff_t isize; int ret; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EACCES; if (!S_ISREG(inode->i_mode)) @@ -2087,19 +2092,28 @@ static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate) goto out; } - /* Create a COW inode for atomic write */ - pinode = f2fs_iget(inode->i_sb, fi->i_pino); - if (IS_ERR(pinode)) { - f2fs_up_write(&fi->i_gc_rwsem[WRITE]); - ret = PTR_ERR(pinode); - goto out; - } + /* Check if the inode already has a COW inode */ + if (fi->cow_inode == NULL) { + /* Create a COW inode for atomic write */ + pinode = f2fs_iget(inode->i_sb, fi->i_pino); + if (IS_ERR(pinode)) { + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + ret = PTR_ERR(pinode); + goto out; + } - ret = f2fs_get_tmpfile(mnt_userns, pinode, &fi->cow_inode); - iput(pinode); - if (ret) { - f2fs_up_write(&fi->i_gc_rwsem[WRITE]); - goto out; + ret = f2fs_get_tmpfile(idmap, pinode, &fi->cow_inode); + iput(pinode); + if (ret) { + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + goto out; + } + + set_inode_flag(fi->cow_inode, FI_COW_FILE); + clear_inode_flag(fi->cow_inode, FI_INLINE_DATA); + } else { + /* Reuse the already created COW inode */ + f2fs_do_truncate_blocks(fi->cow_inode, 0, true); } f2fs_write_inode(inode, NULL); @@ -2107,8 +2121,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate) stat_inc_atomic_inode(inode); set_inode_flag(inode, FI_ATOMIC_FILE); - set_inode_flag(fi->cow_inode, 
FI_COW_FILE); - clear_inode_flag(fi->cow_inode, FI_INLINE_DATA); isize = i_size_read(inode); fi->original_i_size = isize; @@ -2135,10 +2147,10 @@ out: static int f2fs_ioc_commit_atomic_write(struct file *filp) { struct inode *inode = file_inode(filp); - struct user_namespace *mnt_userns = file_mnt_user_ns(filp); + struct mnt_idmap *idmap = file_mnt_idmap(filp); int ret; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EACCES; ret = mnt_want_write_file(filp); @@ -2167,10 +2179,10 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) static int f2fs_ioc_abort_atomic_write(struct file *filp) { struct inode *inode = file_inode(filp); - struct user_namespace *mnt_userns = file_mnt_user_ns(filp); + struct mnt_idmap *idmap = file_mnt_idmap(filp); int ret; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EACCES; ret = mnt_want_write_file(filp); @@ -2338,6 +2350,7 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + u8 encrypt_pw_salt[16]; int err; if (!f2fs_sb_has_encrypt(sbi)) @@ -2362,12 +2375,14 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) goto out_err; } got_it: - if (copy_to_user((__u8 __user *)arg, sbi->raw_super->encrypt_pw_salt, - 16)) - err = -EFAULT; + memcpy(encrypt_pw_salt, sbi->raw_super->encrypt_pw_salt, 16); out_err: f2fs_up_write(&sbi->sb_lock); mnt_drop_write_file(filp); + + if (!err && copy_to_user((__u8 __user *)arg, encrypt_pw_salt, 16)) + err = -EFAULT; + return err; } @@ -2524,7 +2539,7 @@ static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) return __f2fs_ioc_gc_range(filp, &range); } -static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) +static int f2fs_ioc_write_checkpoint(struct file *filp) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -2606,7 +2621,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, */ while (map.m_lblk < pg_end) { map.m_len = pg_end - map.m_lblk; - err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); + err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT); if (err) goto out; @@ -2653,7 +2668,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, do_map: map.m_len = pg_end - map.m_lblk; - err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); + err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT); if (err) goto clear_out; @@ -3090,7 +3105,7 @@ int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -int f2fs_fileattr_set(struct user_namespace *mnt_userns, +int f2fs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); @@ -3227,7 +3242,7 @@ int f2fs_precache_extents(struct inode *inode) map.m_len = end - map.m_lblk; f2fs_down_write(&fi->i_gc_rwsem[WRITE]); - err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_PRECACHE); + err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_PRECACHE); f2fs_up_write(&fi->i_gc_rwsem[WRITE]); if (err) return err; @@ -3238,7 +3253,7 @@ int f2fs_precache_extents(struct inode *inode) return 0; } -static int f2fs_ioc_precache_extents(struct file *filp, unsigned long arg) +static int f2fs_ioc_precache_extents(struct file *filp) { return f2fs_precache_extents(file_inode(filp)); } @@ -3942,7 +3957,7 @@ static int f2fs_ioc_set_compress_option(struct 
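The pw-salt hunk above fixes a classic locking pattern: copy_to_user() can fault and sleep, so it must not run under sb_lock. The salt is staged into a 16-byte stack buffer while locked and copied out only after the lock is dropped. Condensed shape of the fix (a kernel-context fragment, not standalone code):

	u8 salt[16];

	f2fs_down_write(&sbi->sb_lock);
	memcpy(salt, sbi->raw_super->encrypt_pw_salt, sizeof(salt));
	f2fs_up_write(&sbi->sb_lock);

	/* only now touch userspace memory, which may fault */
	if (!err && copy_to_user((__u8 __user *)arg, salt, sizeof(salt)))
		err = -EFAULT;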
file *filp, unsigned long arg) goto out; } - if (inode->i_size != 0) { + if (F2FS_HAS_BLOCKS(inode)) { ret = -EFBIG; goto out; } @@ -3995,7 +4010,7 @@ static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len) return ret; } -static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg) +static int f2fs_ioc_decompress_file(struct file *filp) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -4068,7 +4083,7 @@ out: return ret; } -static int f2fs_ioc_compress_file(struct file *filp, unsigned long arg) +static int f2fs_ioc_compress_file(struct file *filp) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -4184,7 +4199,7 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case F2FS_IOC_GARBAGE_COLLECT_RANGE: return f2fs_ioc_gc_range(filp, arg); case F2FS_IOC_WRITE_CHECKPOINT: - return f2fs_ioc_write_checkpoint(filp, arg); + return f2fs_ioc_write_checkpoint(filp); case F2FS_IOC_DEFRAGMENT: return f2fs_ioc_defragment(filp, arg); case F2FS_IOC_MOVE_RANGE: @@ -4198,7 +4213,7 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case F2FS_IOC_SET_PIN_FILE: return f2fs_ioc_set_pin_file(filp, arg); case F2FS_IOC_PRECACHE_EXTENTS: - return f2fs_ioc_precache_extents(filp, arg); + return f2fs_ioc_precache_extents(filp); case F2FS_IOC_RESIZE_FS: return f2fs_ioc_resize_fs(filp, arg); case FS_IOC_ENABLE_VERITY: @@ -4224,9 +4239,9 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case F2FS_IOC_SET_COMPRESS_OPTION: return f2fs_ioc_set_compress_option(filp, arg); case F2FS_IOC_DECOMPRESS_FILE: - return f2fs_ioc_decompress_file(filp, arg); + return f2fs_ioc_decompress_file(filp); case F2FS_IOC_COMPRESS_FILE: - return f2fs_ioc_compress_file(filp, arg); + return f2fs_ioc_compress_file(filp); default: return -ENOTTY; } @@ -4341,6 +4356,27 @@ out: return ret; } +static void f2fs_trace_rw_file_path(struct kiocb *iocb, size_t count, int rw) +{ + struct inode *inode = file_inode(iocb->ki_filp); + char *buf, *path; + + buf = f2fs_kmalloc(F2FS_I_SB(inode), PATH_MAX, GFP_KERNEL); + if (!buf) + return; + path = dentry_path_raw(file_dentry(iocb->ki_filp), buf, PATH_MAX); + if (IS_ERR(path)) + goto free_buf; + if (rw == WRITE) + trace_f2fs_datawrite_start(inode, iocb->ki_pos, count, + current->pid, path, current->comm); + else + trace_f2fs_dataread_start(inode, iocb->ki_pos, count, + current->pid, path, current->comm); +free_buf: + kfree(buf); +} + static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); @@ -4350,24 +4386,9 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (!f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; - if (trace_f2fs_dataread_start_enabled()) { - char *p = f2fs_kmalloc(F2FS_I_SB(inode), PATH_MAX, GFP_KERNEL); - char *path; - - if (!p) - goto skip_read_trace; + if (trace_f2fs_dataread_start_enabled()) + f2fs_trace_rw_file_path(iocb, iov_iter_count(to), READ); - path = dentry_path_raw(file_dentry(iocb->ki_filp), p, PATH_MAX); - if (IS_ERR(path)) { - kfree(p); - goto skip_read_trace; - } - - trace_f2fs_dataread_start(inode, pos, iov_iter_count(to), - current->pid, path, current->comm); - kfree(p); - } -skip_read_trace: if (f2fs_should_use_dio(inode, iocb, to)) { ret = f2fs_dio_read_iter(iocb, to); } else { @@ -4466,7 +4487,7 @@ static int f2fs_preallocate_blocks(struct kiocb *iocb, struct 
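f2fs_trace_rw_file_path() deduplicates the read and write tracepoint paths. One subtlety worth noting: dentry_path_raw() composes the path backwards into the tail of the passed buffer and returns a pointer inside it (or an ERR_PTR on overflow), so only the original allocation may be freed. Fragment, for illustration:

	char *buf, *path;

	buf = f2fs_kmalloc(F2FS_I_SB(inode), PATH_MAX, GFP_KERNEL);
	if (!buf)
		return;
	path = dentry_path_raw(file_dentry(iocb->ki_filp), buf, PATH_MAX);
	if (!IS_ERR(path))
		pr_info("%s\n", path);	/* path points into the tail of buf */
	kfree(buf);			/* free the buffer, never kfree(path) */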
iov_iter *iter, flag = F2FS_GET_BLOCK_PRE_AIO; } - ret = f2fs_map_blocks(inode, &map, 1, flag); + ret = f2fs_map_blocks(inode, &map, flag); /* -ENOSPC|-EDQUOT are fine to report the number of allocated blocks. */ if (ret < 0 && !((ret == -ENOSPC || ret == -EDQUOT) && map.m_len > 0)) return ret; @@ -4673,24 +4694,9 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (preallocated < 0) { ret = preallocated; } else { - if (trace_f2fs_datawrite_start_enabled()) { - char *p = f2fs_kmalloc(F2FS_I_SB(inode), - PATH_MAX, GFP_KERNEL); - char *path; - - if (!p) - goto skip_write_trace; - path = dentry_path_raw(file_dentry(iocb->ki_filp), - p, PATH_MAX); - if (IS_ERR(path)) { - kfree(p); - goto skip_write_trace; - } - trace_f2fs_datawrite_start(inode, orig_pos, orig_count, - current->pid, path, current->comm); - kfree(p); - } -skip_write_trace: + if (trace_f2fs_datawrite_start_enabled()) + f2fs_trace_rw_file_path(iocb, orig_count, WRITE); + /* Do the actual write. */ ret = dio ? f2fs_dio_write_iter(iocb, from, &may_need_sync) : @@ -4823,6 +4829,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC32_MOVE_RANGE: return f2fs_compat_ioc_move_range(file, arg); case F2FS_IOC_START_ATOMIC_WRITE: + case F2FS_IOC_START_ATOMIC_REPLACE: case F2FS_IOC_COMMIT_ATOMIC_WRITE: case F2FS_IOC_START_VOLATILE_WRITE: case F2FS_IOC_RELEASE_VOLATILE_WRITE: diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6e2cae3d2e71..0a9dfa459860 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -57,7 +57,7 @@ static int gc_thread_func(void *data) /* give it a try one time */ if (gc_th->gc_wake) - gc_th->gc_wake = 0; + gc_th->gc_wake = false; if (try_to_freeze()) { stat_other_skip_bggc_count(sbi); @@ -72,11 +72,9 @@ static int gc_thread_func(void *data) continue; } - if (time_to_inject(sbi, FAULT_CHECKPOINT)) { - f2fs_show_injection_info(sbi, FAULT_CHECKPOINT); + if (time_to_inject(sbi, FAULT_CHECKPOINT)) f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FAULT_INJECT); - } if (!sb_start_write_trylock(sbi->sb)) { stat_other_skip_bggc_count(sbi); @@ -185,7 +183,7 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME; gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME; - gc_th->gc_wake = 0; + gc_th->gc_wake = false; sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); @@ -1150,7 +1148,6 @@ static int ra_data_block(struct inode *inode, pgoff_t index) struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; - struct extent_info ei = {0, }; struct f2fs_io_info fio = { .sbi = sbi, .ino = inode->i_ino, @@ -1159,8 +1156,8 @@ static int ra_data_block(struct inode *inode, pgoff_t index) .op = REQ_OP_READ, .op_flags = 0, .encrypted_page = NULL, - .in_list = false, - .retry = false, + .in_list = 0, + .retry = 0, }; int err; @@ -1168,8 +1165,8 @@ static int ra_data_block(struct inode *inode, pgoff_t index) if (!page) return -ENOMEM; - if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { - dn.data_blkaddr = ei.blk + index - ei.fofs; + if (f2fs_lookup_read_extent_cache_block(inode, index, + &dn.data_blkaddr)) { if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ))) { err = -EFSCORRUPTED; @@ -1248,8 +1245,8 @@ static int move_data_block(struct inode *inode, block_t bidx, .op = REQ_OP_READ, .op_flags = 0, .encrypted_page = NULL, - .in_list = false, - .retry = false, + .in_list = 0, + .retry = 0, }; struct dnode_of_data dn; struct 
f2fs_summary sum; @@ -1365,7 +1362,6 @@ static int move_data_block(struct inode *inode, block_t bidx, dec_page_count(fio.sbi, F2FS_DIRTY_META); set_page_writeback(fio.encrypted_page); - ClearPageError(page); fio.op = REQ_OP_WRITE; fio.op_flags = REQ_SYNC; diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 19b956c2d697..15bd1d680f67 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -41,7 +41,7 @@ struct f2fs_gc_kthread { unsigned int no_gc_sleep_time; /* for changing gc mode */ - unsigned int gc_wake; + bool gc_wake; /* for GC_MERGE mount option */ wait_queue_head_t fggc_wq; /* diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 21a495234ffd..72269e7efd26 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -174,7 +174,6 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) /* write data page to try to make data consistent */ set_page_writeback(page); - ClearPageError(page); fio.old_blkaddr = dn->data_blkaddr; set_inode_flag(dn->inode, FI_HOT_DATA); f2fs_outplace_write_data(dn, &fio); @@ -422,18 +421,17 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, dentry_blk = page_address(page); + /* + * Start by zeroing the full block, to ensure that all unused space is + * zeroed and no uninitialized memory is leaked to disk. + */ + memset(dentry_blk, 0, F2FS_BLKSIZE); + make_dentry_ptr_inline(dir, &src, inline_dentry); make_dentry_ptr_block(dir, &dst, dentry_blk); /* copy data from inline dentry block to new dentry block */ memcpy(dst.bitmap, src.bitmap, src.nr_bitmap); - memset(dst.bitmap + src.nr_bitmap, 0, dst.nr_bitmap - src.nr_bitmap); - /* - * we do not need to zero out remainder part of dentry and filename - * field, since we have used bitmap for marking the usage status of - * them, besides, we can also ignore copying/zeroing reserved space - * of dentry block, because them haven't been used so far. 
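The inline-dirents hunk above replaces selective zeroing with one memset() of the whole destination block, so the dentry and filename slack and the reserved space can no longer leak uninitialized memory to disk. The shape of the hardening as a runnable sketch — fill_block() and the sizes are illustrative:

#include <stdio.h>
#include <string.h>

#define BLKSIZE 4096

static void fill_block(unsigned char *dst, const unsigned char *bitmap,
		       size_t bitmap_len)
{
	memset(dst, 0, BLKSIZE);		/* no stale bytes survive */
	memcpy(dst, bitmap, bitmap_len);	/* then copy only valid data */
}

int main(void)
{
	unsigned char blk[BLKSIZE];
	unsigned char bitmap[8] = { 0xff };

	fill_block(blk, bitmap, sizeof(bitmap));
	printf("byte 0: %#x, byte 100: %#x\n",
	       (unsigned)blk[0], (unsigned)blk[100]);
	return 0;
}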
- */ memcpy(dst.dentry, src.dentry, SIZE_OF_DIR_ENTRY * src.max); memcpy(dst.filename, src.filename, src.max * F2FS_SLOT_LEN); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index ff6cf66ed46b..7d2e2c0dba65 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -262,22 +262,6 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } - if (fi->extent_tree[EX_READ]) { - struct extent_info *ei = &fi->extent_tree[EX_READ]->largest; - - if (ei->len && - (!f2fs_is_valid_blkaddr(sbi, ei->blk, - DATA_GENERIC_ENHANCE) || - !f2fs_is_valid_blkaddr(sbi, ei->blk + ei->len - 1, - DATA_GENERIC_ENHANCE))) { - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_warn(sbi, "%s: inode (ino=%lx) extent info [%u, %u, %u] is incorrect, run fsck to fix", - __func__, inode->i_ino, - ei->blk, ei->fofs, ei->len); - return false; - } - } - if (f2fs_sanity_check_inline_data(inode)) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_data, run fsck to fix", @@ -413,12 +397,6 @@ static int do_read_inode(struct inode *inode) fi->i_inline_xattr_size = 0; } - if (!sanity_check_inode(inode, node_page)) { - f2fs_put_page(node_page, 1); - f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); - return -EFSCORRUPTED; - } - /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) __recover_inline_status(inode, node_page); @@ -466,11 +444,17 @@ static int do_read_inode(struct inode *inode) (fi->i_flags & F2FS_COMPR_FL)) { if (F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_log_cluster_size)) { + unsigned short compress_flag; + atomic_set(&fi->i_compr_blocks, le64_to_cpu(ri->i_compr_blocks)); fi->i_compress_algorithm = ri->i_compress_algorithm; fi->i_log_cluster_size = ri->i_log_cluster_size; - fi->i_compress_flag = le16_to_cpu(ri->i_compress_flag); + compress_flag = le16_to_cpu(ri->i_compress_flag); + fi->i_compress_level = compress_flag >> + COMPRESS_LEVEL_OFFSET; + fi->i_compress_flag = compress_flag & + (BIT(COMPRESS_LEVEL_OFFSET) - 1); fi->i_cluster_size = 1 << fi->i_log_cluster_size; set_inode_flag(inode, FI_COMPRESSED_FILE); } @@ -482,6 +466,18 @@ static int do_read_inode(struct inode *inode) f2fs_init_read_extent_tree(inode, node_page); f2fs_init_age_extent_tree(inode); + if (!sanity_check_inode(inode, node_page)) { + f2fs_put_page(node_page, 1); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); + return -EFSCORRUPTED; + } + + if (!sanity_check_extent_cache(inode)) { + f2fs_put_page(node_page, 1); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); + return -EFSCORRUPTED; + } + f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -686,13 +682,17 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) if (f2fs_sb_has_compression(F2FS_I_SB(inode)) && F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, i_log_cluster_size)) { + unsigned short compress_flag; + ri->i_compr_blocks = cpu_to_le64(atomic_read( &F2FS_I(inode)->i_compr_blocks)); ri->i_compress_algorithm = F2FS_I(inode)->i_compress_algorithm; - ri->i_compress_flag = - cpu_to_le16(F2FS_I(inode)->i_compress_flag); + compress_flag = F2FS_I(inode)->i_compress_flag | + F2FS_I(inode)->i_compress_level << + COMPRESS_LEVEL_OFFSET; + ri->i_compress_flag = cpu_to_le16(compress_flag); ri->i_log_cluster_size = F2FS_I(inode)->i_log_cluster_size; } @@ -714,18 +714,19 @@ void f2fs_update_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *node_page; + int count = 0; retry: node_page = f2fs_get_node_page(sbi, inode->i_ino); if 
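i_compress_flag shrinks to a u8 because the compression level now lives in its own in-memory field, while the on-disk le16 still packs both, level in the high byte. A round-trip sketch of that packing — COMPRESS_LEVEL_OFFSET is 8 in the f2fs headers, and the sample values are illustrative:

#include <stdio.h>

#define COMPRESS_LEVEL_OFFSET 8

int main(void)
{
	unsigned int level = 6, flag = 0x1;	/* e.g. an lz4hc level */
	unsigned short ondisk;

	/* pack, as f2fs_update_inode() now does */
	ondisk = flag | (level << COMPRESS_LEVEL_OFFSET);

	/* unpack, as do_read_inode() now does */
	printf("level=%u flag=%#x\n",
	       (unsigned)(ondisk >> COMPRESS_LEVEL_OFFSET),
	       (unsigned)(ondisk & ((1U << COMPRESS_LEVEL_OFFSET) - 1)));
	return 0;
}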
(IS_ERR(node_page)) { int err = PTR_ERR(node_page); - if (err == -ENOMEM) { - cond_resched(); + /* The node block was truncated. */ + if (err == -ENOENT) + return; + + if (err == -ENOMEM || ++count <= DEFAULT_RETRY_IO_COUNT) goto retry; - } else if (err != -ENOENT) { - f2fs_stop_checkpoint(sbi, false, - STOP_CP_REASON_UPDATE_INODE); - } + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_UPDATE_INODE); return; } f2fs_update_inode(inode, node_page); @@ -766,11 +767,18 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) void f2fs_evict_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - nid_t xnid = F2FS_I(inode)->i_xattr_nid; + struct f2fs_inode_info *fi = F2FS_I(inode); + nid_t xnid = fi->i_xattr_nid; int err = 0; f2fs_abort_atomic_write(inode, true); + if (fi->cow_inode) { + clear_inode_flag(fi->cow_inode, FI_COW_FILE); + iput(fi->cow_inode); + fi->cow_inode = NULL; + } + trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); @@ -809,10 +817,8 @@ retry: if (F2FS_HAS_BLOCKS(inode)) err = f2fs_truncate(inode); - if (time_to_inject(sbi, FAULT_EVICT_INODE)) { - f2fs_show_injection_info(sbi, FAULT_EVICT_INODE); + if (time_to_inject(sbi, FAULT_EVICT_INODE)) err = -EIO; - } if (!err) { f2fs_lock_op(sbi); @@ -857,7 +863,7 @@ no_delete: stat_dec_inline_inode(inode); stat_dec_compr_inode(inode); stat_sub_compr_blocks(inode, - atomic_read(&F2FS_I(inode)->i_compr_blocks)); + atomic_read(&fi->i_compr_blocks)); if (likely(!f2fs_cp_error(sbi) && !is_sbi_flag_set(sbi, SBI_CP_DISABLED))) diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c index 3166a8939ed4..3d5bfb1ad585 100644 --- a/fs/f2fs/iostat.c +++ b/fs/f2fs/iostat.c @@ -14,91 +14,79 @@ #include "iostat.h" #include <trace/events/f2fs.h> -#define NUM_PREALLOC_IOSTAT_CTXS 128 static struct kmem_cache *bio_iostat_ctx_cache; static mempool_t *bio_iostat_ctx_pool; +static inline unsigned long long iostat_get_avg_bytes(struct f2fs_sb_info *sbi, + enum iostat_type type) +{ + return sbi->iostat_count[type] ? 
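f2fs_update_inode_page() now distinguishes three cases: -ENOENT means the node block was truncated (return quietly), -ENOMEM is always worth retrying, and any other error gets a bounded number of attempts before checkpoints are stopped. Generic shape of that policy — do_io() is a mock, and RETRY_COUNT mirrors DEFAULT_RETRY_IO_COUNT, which is 8 upstream:

#include <errno.h>
#include <stdio.h>

#define RETRY_COUNT 8

static int do_io(int *attempts)
{
	return ++*attempts < 3 ? -EIO : 0;	/* succeed on the 3rd try */
}

int main(void)
{
	int count = 0, attempts = 0, err;
retry:
	err = do_io(&attempts);
	if (err) {
		if (err == -ENOENT)
			return 0;	/* target vanished: nothing to do */
		if (err == -ENOMEM || ++count <= RETRY_COUNT)
			goto retry;
		fprintf(stderr, "giving up after %d tries\n", count);
		return 1;
	}
	printf("ok after %d attempts\n", attempts);
	return 0;
}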
div64_u64(sbi->iostat_bytes[type], + sbi->iostat_count[type]) : 0; +} + +#define IOSTAT_INFO_SHOW(name, type) \ + seq_printf(seq, "%-23s %-16llu %-16llu %-16llu\n", \ + name":", sbi->iostat_bytes[type], \ + sbi->iostat_count[type], \ + iostat_get_avg_bytes(sbi, type)) + int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; struct f2fs_sb_info *sbi = F2FS_SB(sb); - time64_t now = ktime_get_real_seconds(); if (!sbi->iostat_enable) return 0; - seq_printf(seq, "time: %-16llu\n", now); + seq_printf(seq, "time: %-16llu\n", ktime_get_real_seconds()); + seq_printf(seq, "\t\t\t%-16s %-16s %-16s\n", + "io_bytes", "count", "avg_bytes"); /* print app write IOs */ seq_puts(seq, "[WRITE]\n"); - seq_printf(seq, "app buffered data: %-16llu\n", - sbi->rw_iostat[APP_BUFFERED_IO]); - seq_printf(seq, "app direct data: %-16llu\n", - sbi->rw_iostat[APP_DIRECT_IO]); - seq_printf(seq, "app mapped data: %-16llu\n", - sbi->rw_iostat[APP_MAPPED_IO]); - seq_printf(seq, "app buffered cdata: %-16llu\n", - sbi->rw_iostat[APP_BUFFERED_CDATA_IO]); - seq_printf(seq, "app mapped cdata: %-16llu\n", - sbi->rw_iostat[APP_MAPPED_CDATA_IO]); + IOSTAT_INFO_SHOW("app buffered data", APP_BUFFERED_IO); + IOSTAT_INFO_SHOW("app direct data", APP_DIRECT_IO); + IOSTAT_INFO_SHOW("app mapped data", APP_MAPPED_IO); + IOSTAT_INFO_SHOW("app buffered cdata", APP_BUFFERED_CDATA_IO); + IOSTAT_INFO_SHOW("app mapped cdata", APP_MAPPED_CDATA_IO); /* print fs write IOs */ - seq_printf(seq, "fs data: %-16llu\n", - sbi->rw_iostat[FS_DATA_IO]); - seq_printf(seq, "fs cdata: %-16llu\n", - sbi->rw_iostat[FS_CDATA_IO]); - seq_printf(seq, "fs node: %-16llu\n", - sbi->rw_iostat[FS_NODE_IO]); - seq_printf(seq, "fs meta: %-16llu\n", - sbi->rw_iostat[FS_META_IO]); - seq_printf(seq, "fs gc data: %-16llu\n", - sbi->rw_iostat[FS_GC_DATA_IO]); - seq_printf(seq, "fs gc node: %-16llu\n", - sbi->rw_iostat[FS_GC_NODE_IO]); - seq_printf(seq, "fs cp data: %-16llu\n", - sbi->rw_iostat[FS_CP_DATA_IO]); - seq_printf(seq, "fs cp node: %-16llu\n", - sbi->rw_iostat[FS_CP_NODE_IO]); - seq_printf(seq, "fs cp meta: %-16llu\n", - sbi->rw_iostat[FS_CP_META_IO]); + IOSTAT_INFO_SHOW("fs data", FS_DATA_IO); + IOSTAT_INFO_SHOW("fs cdata", FS_CDATA_IO); + IOSTAT_INFO_SHOW("fs node", FS_NODE_IO); + IOSTAT_INFO_SHOW("fs meta", FS_META_IO); + IOSTAT_INFO_SHOW("fs gc data", FS_GC_DATA_IO); + IOSTAT_INFO_SHOW("fs gc node", FS_GC_NODE_IO); + IOSTAT_INFO_SHOW("fs cp data", FS_CP_DATA_IO); + IOSTAT_INFO_SHOW("fs cp node", FS_CP_NODE_IO); + IOSTAT_INFO_SHOW("fs cp meta", FS_CP_META_IO); /* print app read IOs */ seq_puts(seq, "[READ]\n"); - seq_printf(seq, "app buffered data: %-16llu\n", - sbi->rw_iostat[APP_BUFFERED_READ_IO]); - seq_printf(seq, "app direct data: %-16llu\n", - sbi->rw_iostat[APP_DIRECT_READ_IO]); - seq_printf(seq, "app mapped data: %-16llu\n", - sbi->rw_iostat[APP_MAPPED_READ_IO]); - seq_printf(seq, "app buffered cdata: %-16llu\n", - sbi->rw_iostat[APP_BUFFERED_CDATA_READ_IO]); - seq_printf(seq, "app mapped cdata: %-16llu\n", - sbi->rw_iostat[APP_MAPPED_CDATA_READ_IO]); + IOSTAT_INFO_SHOW("app buffered data", APP_BUFFERED_READ_IO); + IOSTAT_INFO_SHOW("app direct data", APP_DIRECT_READ_IO); + IOSTAT_INFO_SHOW("app mapped data", APP_MAPPED_READ_IO); + IOSTAT_INFO_SHOW("app buffered cdata", APP_BUFFERED_CDATA_READ_IO); + IOSTAT_INFO_SHOW("app mapped cdata", APP_MAPPED_CDATA_READ_IO); /* print fs read IOs */ - seq_printf(seq, "fs data: %-16llu\n", - sbi->rw_iostat[FS_DATA_READ_IO]); - seq_printf(seq, "fs gc data: 
%-16llu\n", - sbi->rw_iostat[FS_GDATA_READ_IO]); - seq_printf(seq, "fs cdata: %-16llu\n", - sbi->rw_iostat[FS_CDATA_READ_IO]); - seq_printf(seq, "fs node: %-16llu\n", - sbi->rw_iostat[FS_NODE_READ_IO]); - seq_printf(seq, "fs meta: %-16llu\n", - sbi->rw_iostat[FS_META_READ_IO]); + IOSTAT_INFO_SHOW("fs data", FS_DATA_READ_IO); + IOSTAT_INFO_SHOW("fs gc data", FS_GDATA_READ_IO); + IOSTAT_INFO_SHOW("fs cdata", FS_CDATA_READ_IO); + IOSTAT_INFO_SHOW("fs node", FS_NODE_READ_IO); + IOSTAT_INFO_SHOW("fs meta", FS_META_READ_IO); /* print other IOs */ seq_puts(seq, "[OTHER]\n"); - seq_printf(seq, "fs discard: %-16llu\n", - sbi->rw_iostat[FS_DISCARD]); + IOSTAT_INFO_SHOW("fs discard", FS_DISCARD_IO); + IOSTAT_INFO_SHOW("fs flush", FS_FLUSH_IO); return 0; } static inline void __record_iostat_latency(struct f2fs_sb_info *sbi) { - int io, idx = 0; - unsigned int cnt; + int io, idx; struct f2fs_iostat_latency iostat_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; struct iostat_lat_info *io_lat = sbi->iostat_io_lat; unsigned long flags; @@ -106,12 +94,11 @@ static inline void __record_iostat_latency(struct f2fs_sb_info *sbi) spin_lock_irqsave(&sbi->iostat_lat_lock, flags); for (idx = 0; idx < MAX_IO_TYPE; idx++) { for (io = 0; io < NR_PAGE_TYPE; io++) { - cnt = io_lat->bio_cnt[idx][io]; iostat_lat[idx][io].peak_lat = jiffies_to_msecs(io_lat->peak_lat[idx][io]); - iostat_lat[idx][io].cnt = cnt; - iostat_lat[idx][io].avg_lat = cnt ? - jiffies_to_msecs(io_lat->sum_lat[idx][io]) / cnt : 0; + iostat_lat[idx][io].cnt = io_lat->bio_cnt[idx][io]; + iostat_lat[idx][io].avg_lat = iostat_lat[idx][io].cnt ? + jiffies_to_msecs(io_lat->sum_lat[idx][io]) / iostat_lat[idx][io].cnt : 0; io_lat->sum_lat[idx][io] = 0; io_lat->peak_lat[idx][io] = 0; io_lat->bio_cnt[idx][io] = 0; @@ -141,9 +128,9 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi) msecs_to_jiffies(sbi->iostat_period_ms); for (i = 0; i < NR_IO_TYPE; i++) { - iostat_diff[i] = sbi->rw_iostat[i] - - sbi->prev_rw_iostat[i]; - sbi->prev_rw_iostat[i] = sbi->rw_iostat[i]; + iostat_diff[i] = sbi->iostat_bytes[i] - + sbi->prev_iostat_bytes[i]; + sbi->prev_iostat_bytes[i] = sbi->iostat_bytes[i]; } spin_unlock_irqrestore(&sbi->iostat_lock, flags); @@ -159,8 +146,9 @@ void f2fs_reset_iostat(struct f2fs_sb_info *sbi) spin_lock_irq(&sbi->iostat_lock); for (i = 0; i < NR_IO_TYPE; i++) { - sbi->rw_iostat[i] = 0; - sbi->prev_rw_iostat[i] = 0; + sbi->iostat_count[i] = 0; + sbi->iostat_bytes[i] = 0; + sbi->prev_iostat_bytes[i] = 0; } spin_unlock_irq(&sbi->iostat_lock); @@ -169,6 +157,13 @@ void f2fs_reset_iostat(struct f2fs_sb_info *sbi) spin_unlock_irq(&sbi->iostat_lat_lock); } +static inline void __f2fs_update_iostat(struct f2fs_sb_info *sbi, + enum iostat_type type, unsigned long long io_bytes) +{ + sbi->iostat_bytes[type] += io_bytes; + sbi->iostat_count[type]++; +} + void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, enum iostat_type type, unsigned long long io_bytes) { @@ -178,33 +173,33 @@ void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, return; spin_lock_irqsave(&sbi->iostat_lock, flags); - sbi->rw_iostat[type] += io_bytes; + __f2fs_update_iostat(sbi, type, io_bytes); if (type == APP_BUFFERED_IO || type == APP_DIRECT_IO) - sbi->rw_iostat[APP_WRITE_IO] += io_bytes; + __f2fs_update_iostat(sbi, APP_WRITE_IO, io_bytes); if (type == APP_BUFFERED_READ_IO || type == APP_DIRECT_READ_IO) - sbi->rw_iostat[APP_READ_IO] += io_bytes; + __f2fs_update_iostat(sbi, APP_READ_IO, io_bytes); #ifdef CONFIG_F2FS_FS_COMPRESSION if (inode && 
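rw_iostat[] becomes a bytes/count pair per type, so the seq_file output can show io_bytes, count, and a derived avg_bytes column; the kernel routes the division through div64_u64() since a plain 64-bit '/' is unavailable on 32-bit builds. A minimal userspace model of the accounting (names are illustrative):

#include <stdio.h>

enum { FS_DISCARD_IO, FS_FLUSH_IO, NR_IO_TYPE };

static unsigned long long io_bytes[NR_IO_TYPE], io_count[NR_IO_TYPE];

static void update_iostat(int type, unsigned long long bytes)
{
	io_bytes[type] += bytes;	/* running byte total */
	io_count[type]++;		/* one event per call */
}

int main(void)
{
	update_iostat(FS_DISCARD_IO, 4096);
	update_iostat(FS_DISCARD_IO, 8192);

	printf("bytes=%llu count=%llu avg=%llu\n",
	       io_bytes[FS_DISCARD_IO], io_count[FS_DISCARD_IO],
	       io_count[FS_DISCARD_IO] ?
		io_bytes[FS_DISCARD_IO] / io_count[FS_DISCARD_IO] : 0);
	return 0;
}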
f2fs_compressed_file(inode)) { if (type == APP_BUFFERED_IO) - sbi->rw_iostat[APP_BUFFERED_CDATA_IO] += io_bytes; + __f2fs_update_iostat(sbi, APP_BUFFERED_CDATA_IO, io_bytes); if (type == APP_BUFFERED_READ_IO) - sbi->rw_iostat[APP_BUFFERED_CDATA_READ_IO] += io_bytes; + __f2fs_update_iostat(sbi, APP_BUFFERED_CDATA_READ_IO, io_bytes); if (type == APP_MAPPED_READ_IO) - sbi->rw_iostat[APP_MAPPED_CDATA_READ_IO] += io_bytes; + __f2fs_update_iostat(sbi, APP_MAPPED_CDATA_READ_IO, io_bytes); if (type == APP_MAPPED_IO) - sbi->rw_iostat[APP_MAPPED_CDATA_IO] += io_bytes; + __f2fs_update_iostat(sbi, APP_MAPPED_CDATA_IO, io_bytes); if (type == FS_DATA_READ_IO) - sbi->rw_iostat[FS_CDATA_READ_IO] += io_bytes; + __f2fs_update_iostat(sbi, FS_CDATA_READ_IO, io_bytes); if (type == FS_DATA_IO) - sbi->rw_iostat[FS_CDATA_IO] += io_bytes; + __f2fs_update_iostat(sbi, FS_CDATA_IO, io_bytes); } #endif @@ -214,49 +209,48 @@ void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, } static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx, - int rw, bool is_sync) + enum iostat_lat_type lat_type) { unsigned long ts_diff; - unsigned int iotype = iostat_ctx->type; + unsigned int page_type = iostat_ctx->type; struct f2fs_sb_info *sbi = iostat_ctx->sbi; struct iostat_lat_info *io_lat = sbi->iostat_io_lat; - int idx; unsigned long flags; if (!sbi->iostat_enable) return; ts_diff = jiffies - iostat_ctx->submit_ts; - if (iotype >= META_FLUSH) - iotype = META; - - if (rw == 0) { - idx = READ_IO; - } else { - if (is_sync) - idx = WRITE_SYNC_IO; - else - idx = WRITE_ASYNC_IO; + if (page_type == META_FLUSH) { + page_type = META; + } else if (page_type >= NR_PAGE_TYPE) { + f2fs_warn(sbi, "%s: %d over NR_PAGE_TYPE", __func__, page_type); + return; } spin_lock_irqsave(&sbi->iostat_lat_lock, flags); - io_lat->sum_lat[idx][iotype] += ts_diff; - io_lat->bio_cnt[idx][iotype]++; - if (ts_diff > io_lat->peak_lat[idx][iotype]) - io_lat->peak_lat[idx][iotype] = ts_diff; + io_lat->sum_lat[lat_type][page_type] += ts_diff; + io_lat->bio_cnt[lat_type][page_type]++; + if (ts_diff > io_lat->peak_lat[lat_type][page_type]) + io_lat->peak_lat[lat_type][page_type] = ts_diff; spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags); } -void iostat_update_and_unbind_ctx(struct bio *bio, int rw) +void iostat_update_and_unbind_ctx(struct bio *bio) { struct bio_iostat_ctx *iostat_ctx = bio->bi_private; - bool is_sync = bio->bi_opf & REQ_SYNC; + enum iostat_lat_type lat_type; - if (rw == 0) - bio->bi_private = iostat_ctx->post_read_ctx; - else + if (op_is_write(bio_op(bio))) { + lat_type = bio->bi_opf & REQ_SYNC ? 
+ WRITE_SYNC_IO : WRITE_ASYNC_IO; bio->bi_private = iostat_ctx->sbi; - __update_iostat_latency(iostat_ctx, rw, is_sync); + } else { + lat_type = READ_IO; + bio->bi_private = iostat_ctx->post_read_ctx; + } + + __update_iostat_latency(iostat_ctx, lat_type); mempool_free(iostat_ctx, bio_iostat_ctx_pool); } diff --git a/fs/f2fs/iostat.h b/fs/f2fs/iostat.h index 2c048307b6e0..eb99d05cf272 100644 --- a/fs/f2fs/iostat.h +++ b/fs/f2fs/iostat.h @@ -8,20 +8,21 @@ struct bio_post_read_ctx; +enum iostat_lat_type { + READ_IO = 0, + WRITE_SYNC_IO, + WRITE_ASYNC_IO, + MAX_IO_TYPE, +}; + #ifdef CONFIG_F2FS_IOSTAT +#define NUM_PREALLOC_IOSTAT_CTXS 128 #define DEFAULT_IOSTAT_PERIOD_MS 3000 #define MIN_IOSTAT_PERIOD_MS 100 /* maximum period of iostat tracing is 1 day */ #define MAX_IOSTAT_PERIOD_MS 8640000 -enum { - READ_IO, - WRITE_SYNC_IO, - WRITE_ASYNC_IO, - MAX_IO_TYPE, -}; - struct iostat_lat_info { unsigned long sum_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; /* sum of io latencies */ unsigned long peak_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; /* peak io latency */ @@ -57,7 +58,7 @@ static inline struct bio_post_read_ctx *get_post_read_ctx(struct bio *bio) return iostat_ctx->post_read_ctx; } -extern void iostat_update_and_unbind_ctx(struct bio *bio, int rw); +extern void iostat_update_and_unbind_ctx(struct bio *bio); extern void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi, struct bio *bio, struct bio_post_read_ctx *ctx); extern int f2fs_init_iostat_processing(void); @@ -67,7 +68,7 @@ extern void f2fs_destroy_iostat(struct f2fs_sb_info *sbi); #else static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, enum iostat_type type, unsigned long long io_bytes) {} -static inline void iostat_update_and_unbind_ctx(struct bio *bio, int rw) {} +static inline void iostat_update_and_unbind_ctx(struct bio *bio) {} static inline void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi, struct bio *bio, struct bio_post_read_ctx *ctx) {} static inline void iostat_update_submit_ctx(struct bio *bio, diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 6032589099ce..11fc4c8036a9 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -202,7 +202,7 @@ static void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode, file_set_hot(inode); } -static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, +static struct inode *f2fs_new_inode(struct mnt_idmap *idmap, struct inode *dir, umode_t mode, const char *name) { @@ -225,7 +225,7 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, nid_free = true; - inode_init_owner(mnt_userns, inode, dir, mode); + inode_init_owner(idmap, inode, dir, mode); inode->i_ino = ino; inode->i_blocks = 0; @@ -246,7 +246,7 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, (F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL)) F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid; else - F2FS_I(inode)->i_projid = make_kprojid(mnt_userns, + F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns, F2FS_DEF_PROJID); err = fscrypt_prepare_new_inode(dir, inode, &encrypt); @@ -333,7 +333,7 @@ fail_drop: return ERR_PTR(err); } -static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int f2fs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); @@ -350,7 +350,7 @@ static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, mode, dentry->d_name.name); 
+ inode = f2fs_new_inode(idmap, dir, mode, dentry->d_name.name); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -659,7 +659,7 @@ static const char *f2fs_get_link(struct dentry *dentry, return link; } -static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int f2fs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); @@ -682,7 +682,7 @@ static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, S_IFLNK | S_IRWXUGO, NULL); + inode = f2fs_new_inode(idmap, dir, S_IFLNK | S_IRWXUGO, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -739,7 +739,7 @@ out_free_encrypted_link: return err; } -static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); @@ -753,7 +753,7 @@ static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, S_IFDIR | mode, NULL); + inode = f2fs_new_inode(idmap, dir, S_IFDIR | mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -794,7 +794,7 @@ static int f2fs_rmdir(struct inode *dir, struct dentry *dentry) return -ENOTEMPTY; } -static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int f2fs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); @@ -810,7 +810,7 @@ static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, mode, NULL); + inode = f2fs_new_inode(idmap, dir, mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -837,7 +837,7 @@ out: return err; } -static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +static int __f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode, bool is_whiteout, struct inode **new_inode) { @@ -849,7 +849,7 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, mode, NULL); + inode = f2fs_new_inode(idmap, dir, mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -907,7 +907,7 @@ out: return err; } -static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +static int f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); @@ -918,28 +918,25 @@ static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - err = __f2fs_tmpfile(mnt_userns, dir, file, mode, false, NULL); + err = __f2fs_tmpfile(idmap, dir, file, mode, false, NULL); return finish_open_simple(file, err); } -static int f2fs_create_whiteout(struct user_namespace *mnt_userns, +static int f2fs_create_whiteout(struct mnt_idmap *idmap, struct inode *dir, struct inode **whiteout) { - if (unlikely(f2fs_cp_error(F2FS_I_SB(dir)))) - return -EIO; - - return __f2fs_tmpfile(mnt_userns, dir, NULL, + return __f2fs_tmpfile(idmap, dir, NULL, S_IFCHR | WHITEOUT_MODE, true, whiteout); } -int f2fs_get_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +int f2fs_get_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct inode 
**new_inode) { - return __f2fs_tmpfile(mnt_userns, dir, NULL, S_IFREG, false, new_inode); + return __f2fs_tmpfile(idmap, dir, NULL, S_IFREG, false, new_inode); } -static int f2fs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -966,7 +963,7 @@ static int f2fs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, /* * If new_inode is null, the below renaming flow will - * add a link in old_dir which can conver inline_dir. + * add a link in old_dir which can convert inline_dir. * After then, if we failed to get the entry due to other * reasons like ENOMEM, we had to remove the new entry. * Instead of adding such the error handling routine, let's @@ -979,7 +976,7 @@ static int f2fs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, } if (flags & RENAME_WHITEOUT) { - err = f2fs_create_whiteout(mnt_userns, old_dir, &whiteout); + err = f2fs_create_whiteout(idmap, old_dir, &whiteout); if (err) return err; } @@ -1295,7 +1292,7 @@ out: return err; } -static int f2fs_rename2(struct user_namespace *mnt_userns, +static int f2fs_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) @@ -1318,7 +1315,7 @@ static int f2fs_rename2(struct user_namespace *mnt_userns, * VFS has already handled the new dentry existence case, * here, we just deal with "RENAME_NOREPLACE" as regular rename. */ - return f2fs_rename(mnt_userns, old_dir, old_dentry, + return f2fs_rename(idmap, old_dir, old_dentry, new_dir, new_dentry, flags); } @@ -1342,12 +1339,12 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, return target; } -static int f2fs_encrypted_symlink_getattr(struct user_namespace *mnt_userns, +static int f2fs_encrypted_symlink_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { - f2fs_getattr(mnt_userns, path, stat, request_mask, query_flags); + f2fs_getattr(idmap, path, stat, request_mask, query_flags); return fscrypt_symlink_getattr(path, stat); } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index dde4c0458704..bd1dad523796 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1518,23 +1518,24 @@ iput_out: static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) { pgoff_t index; - struct pagevec pvec; + struct folio_batch fbatch; struct page *last_page = NULL; - int nr_pages; + int nr_folios; - pagevec_init(&pvec); + folio_batch_init(&fbatch); index = 0; - while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY))) { + while ((nr_folios = filemap_get_folios_tag(NODE_MAPPING(sbi), &index, + (pgoff_t)-1, PAGECACHE_TAG_DIRTY, + &fbatch))) { int i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct page *page = &fbatch.folios[i]->page; if (unlikely(f2fs_cp_error(sbi))) { f2fs_put_page(last_page, 0); - pagevec_release(&pvec); + folio_batch_release(&fbatch); return ERR_PTR(-EIO); } @@ -1565,7 +1566,7 @@ continue_unlock: last_page = page; unlock_page(page); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } return last_page; @@ -1586,7 +1587,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, .op_flags = wbc_to_write_flags(wbc), .page = page, 
.encrypted_page = NULL, - .submitted = false, + .submitted = 0, .io_type = io_type, .io_wbc = wbc, }; @@ -1650,7 +1651,6 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, } set_page_writeback(page); - ClearPageError(page); fio.old_blkaddr = ni.blk_addr; f2fs_do_write_node_page(nid, &fio); @@ -1731,12 +1731,12 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, unsigned int *seq_id) { pgoff_t index; - struct pagevec pvec; + struct folio_batch fbatch; int ret = 0; struct page *last_page = NULL; bool marked = false; nid_t ino = inode->i_ino; - int nr_pages; + int nr_folios; int nwritten = 0; if (atomic) { @@ -1745,20 +1745,21 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, return PTR_ERR_OR_ZERO(last_page); } retry: - pagevec_init(&pvec); + folio_batch_init(&fbatch); index = 0; - while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY))) { + while ((nr_folios = filemap_get_folios_tag(NODE_MAPPING(sbi), &index, + (pgoff_t)-1, PAGECACHE_TAG_DIRTY, + &fbatch))) { int i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct page *page = &fbatch.folios[i]->page; bool submitted = false; if (unlikely(f2fs_cp_error(sbi))) { f2fs_put_page(last_page, 0); - pagevec_release(&pvec); + folio_batch_release(&fbatch); ret = -EIO; goto out; } @@ -1824,7 +1825,7 @@ continue_unlock: break; } } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); if (ret || marked) @@ -1889,17 +1890,18 @@ static bool flush_dirty_inode(struct page *page) void f2fs_flush_inline_data(struct f2fs_sb_info *sbi) { pgoff_t index = 0; - struct pagevec pvec; - int nr_pages; + struct folio_batch fbatch; + int nr_folios; - pagevec_init(&pvec); + folio_batch_init(&fbatch); - while ((nr_pages = pagevec_lookup_tag(&pvec, - NODE_MAPPING(sbi), &index, PAGECACHE_TAG_DIRTY))) { + while ((nr_folios = filemap_get_folios_tag(NODE_MAPPING(sbi), &index, + (pgoff_t)-1, PAGECACHE_TAG_DIRTY, + &fbatch))) { int i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct page *page = &fbatch.folios[i]->page; if (!IS_DNODE(page)) continue; @@ -1926,7 +1928,7 @@ continue_unlock: } unlock_page(page); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } } @@ -1936,23 +1938,24 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, bool do_balance, enum iostat_type io_type) { pgoff_t index; - struct pagevec pvec; + struct folio_batch fbatch; int step = 0; int nwritten = 0; int ret = 0; - int nr_pages, done = 0; + int nr_folios, done = 0; - pagevec_init(&pvec); + folio_batch_init(&fbatch); next_step: index = 0; - while (!done && (nr_pages = pagevec_lookup_tag(&pvec, - NODE_MAPPING(sbi), &index, PAGECACHE_TAG_DIRTY))) { + while (!done && (nr_folios = filemap_get_folios_tag(NODE_MAPPING(sbi), + &index, (pgoff_t)-1, PAGECACHE_TAG_DIRTY, + &fbatch))) { int i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct page *page = &fbatch.folios[i]->page; bool submitted = false; /* give a priority to WB_SYNC threads */ @@ -2027,7 +2030,7 @@ write_node: if (--wbc->nr_to_write == 0) break; } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); if (wbc->nr_to_write == 0) { @@ -2079,8 +2082,6 @@ int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); 
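The node.c hunks above are one mechanical conversion repeated across last_fsync_dnode(), f2fs_fsync_node_pages(), f2fs_flush_inline_data() and f2fs_sync_node_pages(): the legacy pagevec_lookup_tag() loop becomes a folio_batch walk driven by filemap_get_folios_tag(), which takes an explicit end index and hands back referenced folios. A minimal sketch of the converted loop shape, using the same kernel APIs (the function name walk_dirty_folios and its bare mapping argument are illustrative, not from the patch; this builds only inside a kernel tree):

    #include <linux/pagemap.h>
    #include <linux/pagevec.h>
    #include <linux/sched.h>

    /* Visit every folio tagged dirty in @mapping, batch by batch. */
    static void walk_dirty_folios(struct address_space *mapping)
    {
        struct folio_batch fbatch;
        pgoff_t index = 0;
        unsigned int nr_folios, i;

        folio_batch_init(&fbatch);
        /* Fills the batch with referenced folios and advances @index;
         * (pgoff_t)-1 means "walk to the end of the mapping". */
        while ((nr_folios = filemap_get_folios_tag(mapping, &index,
                        (pgoff_t)-1, PAGECACHE_TAG_DIRTY, &fbatch))) {
            for (i = 0; i < nr_folios; i++) {
                struct page *page = &fbatch.folios[i]->page;
                /* per-page work goes here, as in the f2fs callers */
            }
            folio_batch_release(&fbatch);   /* drop batch references */
            cond_resched();
        }
    }

The early-exit paths in the converted functions show the one obligation the new API shares with the old one: folio_batch_release() must run before bailing out, exactly where pagevec_release() used to.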
f2fs_wait_on_page_writeback(page, NODE, true, false); - if (TestClearPageError(page)) - ret = -EIO; put_page(page); @@ -2544,10 +2545,8 @@ bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i = NULL; retry: - if (time_to_inject(sbi, FAULT_ALLOC_NID)) { - f2fs_show_injection_info(sbi, FAULT_ALLOC_NID); + if (time_to_inject(sbi, FAULT_ALLOC_NID)) return false; - } spin_lock(&nm_i->nid_list_lock); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 77fd453949b1..dfd41908b12d 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -258,15 +258,15 @@ static int recover_quota_data(struct inode *inode, struct page *page) attr.ia_vfsuid = VFSUIDT_INIT(make_kuid(inode->i_sb->s_user_ns, i_uid)); attr.ia_vfsgid = VFSGIDT_INIT(make_kgid(inode->i_sb->s_user_ns, i_gid)); - if (!vfsuid_eq(attr.ia_vfsuid, i_uid_into_vfsuid(&init_user_ns, inode))) + if (!vfsuid_eq(attr.ia_vfsuid, i_uid_into_vfsuid(&nop_mnt_idmap, inode))) attr.ia_valid |= ATTR_UID; - if (!vfsgid_eq(attr.ia_vfsgid, i_gid_into_vfsgid(&init_user_ns, inode))) + if (!vfsgid_eq(attr.ia_vfsgid, i_gid_into_vfsgid(&nop_mnt_idmap, inode))) attr.ia_valid |= ATTR_GID; if (!attr.ia_valid) return 0; - err = dquot_transfer(&init_user_ns, inode, &attr); + err = dquot_transfer(&nop_mnt_idmap, inode, &attr); if (err) set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR); return err; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ae3c4e5474ef..227e25836173 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -192,18 +192,18 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean) if (!f2fs_is_atomic_file(inode)) return; - clear_inode_flag(fi->cow_inode, FI_COW_FILE); - iput(fi->cow_inode); - fi->cow_inode = NULL; release_atomic_write_cnt(inode); clear_inode_flag(inode, FI_ATOMIC_COMMITTED); clear_inode_flag(inode, FI_ATOMIC_REPLACE); clear_inode_flag(inode, FI_ATOMIC_FILE); stat_dec_atomic_inode(inode); + F2FS_I(inode)->atomic_write_task = NULL; + if (clean) { truncate_inode_pages_final(inode->i_mapping); f2fs_i_size_write(inode, fi->original_i_size); + fi->original_i_size = 0; } } @@ -255,6 +255,9 @@ retry: } f2fs_put_dnode(&dn); + + trace_f2fs_replace_atomic_write_block(inode, F2FS_I(inode)->cow_inode, + index, *old_addr, new_addr, recover); return 0; } @@ -262,19 +265,24 @@ static void __complete_revoke_list(struct inode *inode, struct list_head *head, bool revoke) { struct revoke_entry *cur, *tmp; + pgoff_t start_index = 0; bool truncate = is_inode_flag_set(inode, FI_ATOMIC_REPLACE); list_for_each_entry_safe(cur, tmp, head, list) { - if (revoke) + if (revoke) { __replace_atomic_write_block(inode, cur->index, cur->old_addr, NULL, true); + } else if (truncate) { + f2fs_truncate_hole(inode, start_index, cur->index); + start_index = cur->index + 1; + } list_del(&cur->list); kmem_cache_free(revoke_entry_slab, cur); } if (!revoke && truncate) - f2fs_do_truncate_blocks(inode, 0, false); + f2fs_do_truncate_blocks(inode, start_index * PAGE_SIZE, false); } static int __f2fs_commit_atomic_write(struct inode *inode) @@ -384,10 +392,8 @@ int f2fs_commit_atomic_write(struct inode *inode) */ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { - if (time_to_inject(sbi, FAULT_CHECKPOINT)) { - f2fs_show_injection_info(sbi, FAULT_CHECKPOINT); + if (time_to_inject(sbi, FAULT_CHECKPOINT)) f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FAULT_INJECT); - } /* balance_fs_bg is able to be pending */ if (need && excess_cached_nats(sbi)) @@ -508,6 +514,8 @@ static int 
__submit_flush_wait(struct f2fs_sb_info *sbi, trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER), test_opt(sbi, FLUSH_MERGE), ret); + if (!ret) + f2fs_update_iostat(sbi, NULL, FS_FLUSH_IO, 0); return ret; } @@ -1059,7 +1067,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->granularity = granularity; dpolicy->max_requests = dcc->max_discard_request; - dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->io_aware_gran = dcc->discard_io_aware_gran; dpolicy->timeout = false; if (discard_type == DPOLICY_BG) { @@ -1095,9 +1103,8 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, block_t start, block_t len); /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, - struct discard_policy *dpolicy, - struct discard_cmd *dc, - unsigned int *issued) + struct discard_policy *dpolicy, + struct discard_cmd *dc, int *issued) { struct block_device *bdev = dc->bdev; unsigned int max_discard_blocks = @@ -1141,7 +1148,6 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, dc->len += len; if (time_to_inject(sbi, FAULT_DISCARD)) { - f2fs_show_injection_info(sbi, FAULT_DISCARD); err = -EIO; } else { err = __blkdev_issue_discard(bdev, @@ -1186,7 +1192,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, atomic_inc(&dcc->issued_discard); - f2fs_update_iostat(sbi, NULL, FS_DISCARD, len * F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_DISCARD_IO, len * F2FS_BLKSIZE); lstart += len; start += len; @@ -1378,8 +1384,8 @@ static void __queue_discard_cmd(struct f2fs_sb_info *sbi, mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock); } -static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, - struct discard_policy *dpolicy) +static void __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, int *issued) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_cmd *prev_dc = NULL, *next_dc = NULL; @@ -1387,7 +1393,6 @@ static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, struct discard_cmd *dc; struct blk_plug plug; unsigned int pos = dcc->next_pos; - unsigned int issued = 0; bool io_interrupted = false; mutex_lock(&dcc->cmd_lock); @@ -1414,9 +1419,9 @@ static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, } dcc->next_pos = dc->lstart + dc->len; - err = __submit_discard_cmd(sbi, dpolicy, dc, &issued); + err = __submit_discard_cmd(sbi, dpolicy, dc, issued); - if (issued >= dpolicy->max_requests) + if (*issued >= dpolicy->max_requests) break; next: node = rb_next(&dc->rb_node); @@ -1432,10 +1437,8 @@ next: mutex_unlock(&dcc->cmd_lock); - if (!issued && io_interrupted) - issued = -1; - - return issued; + if (!(*issued) && io_interrupted) + *issued = -1; } static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy); @@ -1463,8 +1466,10 @@ retry: if (i + 1 < dpolicy->granularity) break; - if (i + 1 < dcc->max_ordered_discard && dpolicy->ordered) - return __issue_discard_cmd_orderly(sbi, dpolicy); + if (i + 1 < dcc->max_ordered_discard && dpolicy->ordered) { + __issue_discard_cmd_orderly(sbi, dpolicy, &issued); + return issued; + } pend_list = &dcc->pend_list[i]; @@ -1609,9 +1614,9 @@ static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi, return __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX); /* wait all */ - __init_discard_policy(sbi, &dp, DPOLICY_FSTRIM, 1); + __init_discard_policy(sbi, &dp, DPOLICY_FSTRIM, 
MIN_DISCARD_GRANULARITY); discard_blks = __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); - __init_discard_policy(sbi, &dp, DPOLICY_UMOUNT, 1); + __init_discard_policy(sbi, &dp, DPOLICY_UMOUNT, MIN_DISCARD_GRANULARITY); discard_blks += __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); return discard_blks; @@ -1653,7 +1658,14 @@ void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi) } } -/* This comes from f2fs_put_super */ +/** + * f2fs_issue_discard_timeout() - Issue all discard commands within UMOUNT_DISCARD_TIMEOUT + * @sbi: the f2fs_sb_info data for the discard commands to issue + * + * When UMOUNT_DISCARD_TIMEOUT is exceeded, all remaining discard commands will be dropped. + * + * Return: true if all discard commands were issued or none needed to be issued, false otherwise. + */ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; @@ -1661,7 +1673,7 @@ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) bool dropped; if (!atomic_read(&dcc->discard_cmd_cnt)) - return false; + return true; __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); @@ -1672,7 +1684,7 @@ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) __wait_all_discard_cmd(sbi, NULL); f2fs_bug_on(sbi, atomic_read(&dcc->discard_cmd_cnt)); - return dropped; + return !dropped; } static int issue_discard_thread(void *data) @@ -1694,13 +1706,14 @@ static int issue_discard_thread(void *data) if (sbi->gc_mode == GC_URGENT_HIGH || !f2fs_available_free_memory(sbi, DISCARD_CACHE)) - __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); + __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, + MIN_DISCARD_GRANULARITY); else __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, dcc->discard_granularity); if (dcc->discard_wake) - dcc->discard_wake = 0; + dcc->discard_wake = false; /* clean up pending candidates before going to sleep */ if (atomic_read(&dcc->queued_discard)) @@ -2065,6 +2078,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) if (!dcc) return -ENOMEM; + dcc->discard_io_aware_gran = MAX_PLIST_NUM; dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY; if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) @@ -2327,17 +2341,13 @@ bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) return is_cp; } -/* - * This function should be resided under the curseg_mutex lock - */ -static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, - struct f2fs_summary *sum) +static unsigned short f2fs_curseg_valid_blocks(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); - void *addr = curseg->sum_blk; - addr += curseg->next_blkoff * sizeof(struct f2fs_summary); - memcpy(addr, sum, sizeof(struct f2fs_summary)); + if (sbi->ckpt->alloc_type[type] == SSR) + return sbi->blocks_per_seg; + return curseg->next_blkoff; } /* @@ -2349,15 +2359,11 @@ int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) int i, sum_in_page; for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { - if (sbi->ckpt->alloc_type[i] == SSR) - valid_sum_count += sbi->blocks_per_seg; - else { - if (for_ra) - valid_sum_count += le16_to_cpu( - F2FS_CKPT(sbi)->cur_data_blkoff[i]); - else - valid_sum_count += curseg_blkoff(sbi, i); - } + if (sbi->ckpt->alloc_type[i] != SSR && for_ra) + valid_sum_count += + le16_to_cpu(F2FS_CKPT(sbi)->cur_data_blkoff[i]); + else + valid_sum_count += f2fs_curseg_valid_blocks(sbi, i); } sum_in_page =
(PAGE_SIZE - 2 * SUM_JOURNAL_SIZE - @@ -2628,30 +2634,10 @@ static int __next_free_blkoff(struct f2fs_sb_info *sbi, return __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start); } -/* - * If a segment is written by LFS manner, next block offset is just obtained - * by increasing the current block offset. However, if a segment is written by - * SSR manner, next block offset obtained by calling __next_free_blkoff - */ -static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, - struct curseg_info *seg) +static int f2fs_find_next_ssr_block(struct f2fs_sb_info *sbi, + struct curseg_info *seg) { - if (seg->alloc_type == SSR) { - seg->next_blkoff = - __next_free_blkoff(sbi, seg->segno, - seg->next_blkoff + 1); - } else { - seg->next_blkoff++; - if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) { - /* To allocate block chunks in different sizes, use random number */ - if (--seg->fragment_remained_chunk <= 0) { - seg->fragment_remained_chunk = - get_random_u32_inclusive(1, sbi->max_fragment_chunk); - seg->next_blkoff += - get_random_u32_inclusive(1, sbi->max_fragment_hole); - } - } - } + return __next_free_blkoff(sbi, seg->segno, seg->next_blkoff + 1); } bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) @@ -2909,33 +2895,23 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int old_segno; - if (!curseg->inited) - goto alloc; - - if (force || curseg->next_blkoff || - get_valid_blocks(sbi, curseg->segno, new_sec)) - goto alloc; - - if (!get_ckpt_valid_blocks(sbi, curseg->segno, new_sec)) + if (!force && curseg->inited && + !curseg->next_blkoff && + !get_valid_blocks(sbi, curseg->segno, new_sec) && + !get_ckpt_valid_blocks(sbi, curseg->segno, new_sec)) return; -alloc: + old_segno = curseg->segno; new_curseg(sbi, type, true); stat_inc_seg_type(sbi, curseg); locate_dirty_segment(sbi, old_segno); } -static void __allocate_new_section(struct f2fs_sb_info *sbi, - int type, bool force) -{ - __allocate_new_segment(sbi, type, true, force); -} - void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force) { f2fs_down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); - __allocate_new_section(sbi, type, force); + __allocate_new_segment(sbi, type, true, force); up_write(&SIT_I(sbi)->sentry_lock); f2fs_up_read(&SM_I(sbi)->curseg_lock); } @@ -3113,13 +3089,6 @@ out: return err; } -static bool __has_curseg_space(struct f2fs_sb_info *sbi, - struct curseg_info *curseg) -{ - return curseg->next_blkoff < f2fs_usable_blks_in_seg(sbi, - curseg->segno); -} - int f2fs_rw_hint_to_seg_type(enum rw_hint hint) { switch (hint) { @@ -3238,6 +3207,19 @@ static int __get_segment_type(struct f2fs_io_info *fio) return type; } +static void f2fs_randomize_chunk(struct f2fs_sb_info *sbi, + struct curseg_info *seg) +{ + /* To allocate block chunks in different sizes, use random number */ + if (--seg->fragment_remained_chunk > 0) + return; + + seg->fragment_remained_chunk = + get_random_u32_inclusive(1, sbi->max_fragment_chunk); + seg->next_blkoff += + get_random_u32_inclusive(1, sbi->max_fragment_hole); +} + void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, @@ -3248,6 +3230,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, unsigned long long old_mtime; bool from_gc = (type == CURSEG_ALL_DATA_ATGC); struct seg_entry *se = NULL; + bool segment_full = 
false; f2fs_down_read(&SM_I(sbi)->curseg_lock); @@ -3266,15 +3249,16 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, f2fs_wait_discard_bio(sbi, *new_blkaddr); - /* - * __add_sum_entry should be resided under the curseg_mutex - * because, this function updates a summary entry in the - * current summary block. - */ - __add_sum_entry(sbi, type, sum); - - __refresh_next_blkoff(sbi, curseg); - + curseg->sum_blk->entries[curseg->next_blkoff] = *sum; + if (curseg->alloc_type == SSR) { + curseg->next_blkoff = f2fs_find_next_ssr_block(sbi, curseg); + } else { + curseg->next_blkoff++; + if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) + f2fs_randomize_chunk(sbi, curseg); + } + if (curseg->next_blkoff >= f2fs_usable_blks_in_seg(sbi, curseg->segno)) + segment_full = true; stat_inc_block_count(sbi, curseg); if (from_gc) { @@ -3293,10 +3277,11 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) update_sit_entry(sbi, old_blkaddr, -1); - if (!__has_curseg_space(sbi, curseg)) { - /* - * Flush out current segment and replace it with new segment. - */ + /* + * If the current segment is full, flush it out and replace it with a + * new segment. + */ + if (segment_full) { if (from_gc) { get_atssr_segment(sbi, type, se->type, AT_SSR, se->mtime); @@ -3331,10 +3316,10 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_bio_info *io; if (F2FS_IO_ALIGNED(sbi)) - fio->retry = false; + fio->retry = 0; INIT_LIST_HEAD(&fio->list); - fio->in_list = true; + fio->in_list = 1; io = sbi->write_io[fio->type] + fio->temp; spin_lock(&io->io_lock); list_add_tail(&fio->list, &io->io_list); @@ -3415,14 +3400,13 @@ void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page, .new_blkaddr = page->index, .page = page, .encrypted_page = NULL, - .in_list = false, + .in_list = 0, }; if (unlikely(page->index >= MAIN_BLKADDR(sbi))) fio.op_flags &= ~REQ_META; set_page_writeback(page); - ClearPageError(page); f2fs_submit_page_write(&fio); stat_inc_meta_count(sbi, page->index); @@ -3487,7 +3471,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) stat_inc_inplace_blocks(fio->sbi); - if (fio->bio && !(SM_I(sbi)->ipu_policy & (1 << F2FS_IPU_NOCACHE))) + if (fio->bio && !IS_F2FS_IPU_NOCACHE(sbi)) err = f2fs_merge_page_bio(fio); else err = f2fs_submit_page_bio(fio); @@ -3576,7 +3560,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); - __add_sum_entry(sbi, type, sum); + curseg->sum_blk->entries[curseg->next_blkoff] = *sum; if (!recover_curseg || recover_newaddr) { if (!from_gc) @@ -3634,7 +3618,7 @@ void f2fs_wait_on_page_writeback(struct page *page, /* submit cached LFS IO */ f2fs_submit_merged_write_cond(sbi, NULL, page, 0, type); - /* sbumit cached IPU IO */ + /* submit cached IPU IO */ f2fs_submit_merged_ipu_write(sbi, NULL, page); if (ordered) { wait_on_page_writeback(page); @@ -3885,15 +3869,8 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) /* Step 3: write summary entries */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { - unsigned short blkoff; - seg_i = CURSEG_I(sbi, i); - if (sbi->ckpt->alloc_type[i] == SSR) - blkoff = sbi->blocks_per_seg; - else - blkoff = curseg_blkoff(sbi, i); - - for (j = 0; j < blkoff; j++) { + for (j = 0; j < f2fs_curseg_valid_blocks(sbi, i); j++) { if (!page) { page = f2fs_grab_meta_page(sbi, blkaddr++); kaddr = 
(unsigned char *)page_address(page); @@ -5126,7 +5103,7 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi) sm_info->rec_prefree_segments = DEF_MAX_RECLAIM_PREFREE_SEGMENTS; if (!f2fs_lfs_mode(sbi)) - sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; + sm_info->ipu_policy = BIT(F2FS_IPU_FSYNC); sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; sm_info->min_seq_blocks = sbi->blocks_per_seg; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 3ad1b7b6fa94..efdb7fc3b797 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -670,6 +670,9 @@ static inline int utilization(struct f2fs_sb_info *sbi) #define SMALL_VOLUME_SEGMENTS (16 * 512) /* 16GB */ +#define F2FS_IPU_DISABLE 0 + +/* Modification on enum should be synchronized with ipu_mode_names array */ enum { F2FS_IPU_FORCE, F2FS_IPU_SSR, @@ -679,8 +682,29 @@ enum { F2FS_IPU_ASYNC, F2FS_IPU_NOCACHE, F2FS_IPU_HONOR_OPU_WRITE, + F2FS_IPU_MAX, }; +static inline bool IS_F2FS_IPU_DISABLE(struct f2fs_sb_info *sbi) +{ + return SM_I(sbi)->ipu_policy == F2FS_IPU_DISABLE; +} + +#define F2FS_IPU_POLICY(name) \ +static inline bool IS_##name(struct f2fs_sb_info *sbi) \ +{ \ + return SM_I(sbi)->ipu_policy & BIT(name); \ +} + +F2FS_IPU_POLICY(F2FS_IPU_FORCE); +F2FS_IPU_POLICY(F2FS_IPU_SSR); +F2FS_IPU_POLICY(F2FS_IPU_UTIL); +F2FS_IPU_POLICY(F2FS_IPU_SSR_UTIL); +F2FS_IPU_POLICY(F2FS_IPU_FSYNC); +F2FS_IPU_POLICY(F2FS_IPU_ASYNC); +F2FS_IPU_POLICY(F2FS_IPU_NOCACHE); +F2FS_IPU_POLICY(F2FS_IPU_HONOR_OPU_WRITE); + static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi, int type) { @@ -695,15 +719,10 @@ static inline unsigned char curseg_alloc_type(struct f2fs_sb_info *sbi, return curseg->alloc_type; } -static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type) -{ - struct curseg_info *curseg = CURSEG_I(sbi, type); - return curseg->next_blkoff; -} - -static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) +static inline bool valid_main_segno(struct f2fs_sb_info *sbi, + unsigned int segno) { - f2fs_bug_on(sbi, segno > TOTAL_SEGS(sbi) - 1); + return segno <= (MAIN_SEGS(sbi) - 1); } static inline void verify_fio_blkaddr(struct f2fs_io_info *fio) @@ -758,7 +777,7 @@ static inline int check_block_count(struct f2fs_sb_info *sbi, /* check segment usage, and check boundary of a given segment number */ if (unlikely(GET_SIT_VBLOCKS(raw_sit) > usable_blks_per_seg - || segno > TOTAL_SEGS(sbi) - 1)) { + || !valid_main_segno(sbi, segno))) { f2fs_err(sbi, "Wrong valid blocks %d or segno %u", GET_SIT_VBLOCKS(raw_sit), segno); set_sbi_flag(sbi, SBI_NEED_FSCK); @@ -775,7 +794,7 @@ static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, unsigned int offset = SIT_BLOCK_OFFSET(start); block_t blk_addr = sit_i->sit_base_addr + offset; - check_seg_range(sbi, start); + f2fs_bug_on(sbi, !valid_main_segno(sbi, start)); #ifdef CONFIG_F2FS_CHECK_FS if (f2fs_test_bit(offset, sit_i->sit_bitmap) != @@ -924,6 +943,6 @@ static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) if (!wakeup || !is_idle(sbi, DISCARD_TIME)) return; wake_up: - dcc->discard_wake = 1; + dcc->discard_wake = true; wake_up_interruptible_all(&dcc->discard_wait_queue); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1f812b9ce985..fbaaabbcd6de 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -540,12 +540,6 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb, opt, err); return -EINVAL; } - err = fscrypt_add_test_dummy_key(sb, policy); - if (err) { - f2fs_warn(sbi, 
"Error adding test dummy encryption key [%d]", - err); - return err; - } f2fs_warn(sbi, "Test dummy encryption mode enabled"); return 0; } @@ -1294,19 +1288,18 @@ default_check: * zone alignment optimization. This is optional for host-aware * devices, but mandatory for host-managed zoned block devices. */ -#ifndef CONFIG_BLK_DEV_ZONED - if (f2fs_sb_has_blkzoned(sbi)) { - f2fs_err(sbi, "Zoned block device support is not enabled"); - return -EINVAL; - } -#endif if (f2fs_sb_has_blkzoned(sbi)) { +#ifdef CONFIG_BLK_DEV_ZONED if (F2FS_OPTION(sbi).discard_unit != DISCARD_UNIT_SECTION) { f2fs_info(sbi, "Zoned block device doesn't need small discard, set discard_unit=section by default"); F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_SECTION; } +#else + f2fs_err(sbi, "Zoned block device support is not enabled"); + return -EINVAL; +#endif } #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -1347,12 +1340,12 @@ default_check: } if (test_opt(sbi, DISABLE_CHECKPOINT) && f2fs_lfs_mode(sbi)) { - f2fs_err(sbi, "LFS not compatible with checkpoint=disable"); + f2fs_err(sbi, "LFS is not compatible with checkpoint=disable"); return -EINVAL; } if (test_opt(sbi, ATGC) && f2fs_lfs_mode(sbi)) { - f2fs_err(sbi, "LFS not compatible with ATGC"); + f2fs_err(sbi, "LFS is not compatible with ATGC"); return -EINVAL; } @@ -1372,10 +1365,8 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) { struct f2fs_inode_info *fi; - if (time_to_inject(F2FS_SB(sb), FAULT_SLAB_ALLOC)) { - f2fs_show_injection_info(F2FS_SB(sb), FAULT_SLAB_ALLOC); + if (time_to_inject(F2FS_SB(sb), FAULT_SLAB_ALLOC)) return NULL; - } fi = alloc_inode_sb(sb, f2fs_inode_cachep, GFP_F2FS_ZERO); if (!fi) @@ -1430,8 +1421,6 @@ static int f2fs_drop_inode(struct inode *inode) atomic_inc(&inode->i_count); spin_unlock(&inode->i_lock); - f2fs_abort_atomic_write(inode, true); - /* should remain fi->extent_tree for writepage */ f2fs_destroy_extent_node(inode); @@ -1549,7 +1538,7 @@ static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); int i; - bool dropped; + bool done; /* unregister procfs/sysfs entries in advance to avoid race case */ f2fs_unregister_sysfs(sbi); @@ -1579,9 +1568,8 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - dropped = f2fs_issue_discard_timeout(sbi); - - if (f2fs_realtime_discard_enable(sbi) && !sbi->discard_blks && !dropped) { + done = f2fs_issue_discard_timeout(sbi); + if (f2fs_realtime_discard_enable(sbi) && !sbi->discard_blks && done) { struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; @@ -1906,15 +1894,24 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, GC_MERGE)) seq_puts(seq, ",gc_merge"); + else + seq_puts(seq, ",nogc_merge"); if (test_opt(sbi, DISABLE_ROLL_FORWARD)) seq_puts(seq, ",disable_roll_forward"); if (test_opt(sbi, NORECOVERY)) seq_puts(seq, ",norecovery"); - if (test_opt(sbi, DISCARD)) + if (test_opt(sbi, DISCARD)) { seq_puts(seq, ",discard"); - else + if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK) + seq_printf(seq, ",discard_unit=%s", "block"); + else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) + seq_printf(seq, ",discard_unit=%s", "segment"); + else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) + seq_printf(seq, ",discard_unit=%s", "section"); + } else { seq_puts(seq, ",nodiscard"); + } if (test_opt(sbi, NOHEAP)) seq_puts(seq, ",no_heap"); else @@ -2038,13 +2035,6 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry 
*root) if (test_opt(sbi, ATGC)) seq_puts(seq, ",atgc"); - if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK) - seq_printf(seq, ",discard_unit=%s", "block"); - else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) - seq_printf(seq, ",discard_unit=%s", "segment"); - else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) - seq_printf(seq, ",discard_unit=%s", "section"); - if (F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_NORMAL) seq_printf(seq, ",memory=%s", "normal"); else if (F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW) @@ -2306,6 +2296,12 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } } #endif + if (f2fs_lfs_mode(sbi) && !IS_F2FS_IPU_DISABLE(sbi)) { + err = -EINVAL; + f2fs_warn(sbi, "LFS is not compatible with IPU"); + goto restore_opts; + } + /* disallow enable atgc dynamically */ if (no_atgc == !!test_opt(sbi, ATGC)) { err = -EINVAL; @@ -2595,10 +2591,8 @@ retry: int f2fs_dquot_initialize(struct inode *inode) { - if (time_to_inject(F2FS_I_SB(inode), FAULT_DQUOT_INIT)) { - f2fs_show_injection_info(F2FS_I_SB(inode), FAULT_DQUOT_INIT); + if (time_to_inject(F2FS_I_SB(inode), FAULT_DQUOT_INIT)) return -ESRCH; - } return dquot_initialize(inode); } @@ -4089,8 +4083,9 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) if (f2fs_block_unit_discard(sbi)) SM_I(sbi)->dcc_info->discard_granularity = MIN_DISCARD_GRANULARITY; - SM_I(sbi)->ipu_policy = 1 << F2FS_IPU_FORCE | - 1 << F2FS_IPU_HONOR_OPU_WRITE; + if (!f2fs_lfs_mode(sbi)) + SM_I(sbi)->ipu_policy = BIT(F2FS_IPU_FORCE) | + BIT(F2FS_IPU_HONOR_OPU_WRITE); } sbi->readdir_ra = true; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 83a366f3ee80..0b19163c90d4 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -473,6 +473,17 @@ out: return count; } + if (!strcmp(a->attr.name, "discard_io_aware_gran")) { + if (t > MAX_PLIST_NUM) + return -EINVAL; + if (!f2fs_block_unit_discard(sbi)) + return -EINVAL; + if (t == *ui) + return count; + *ui = t; + return count; + } + if (!strcmp(a->attr.name, "discard_granularity")) { if (t == 0 || t > MAX_PLIST_NUM) return -EINVAL; @@ -511,7 +522,7 @@ out: } else if (t == 1) { sbi->gc_mode = GC_URGENT_HIGH; if (sbi->gc_thread) { - sbi->gc_thread->gc_wake = 1; + sbi->gc_thread->gc_wake = true; wake_up_interruptible_all( &sbi->gc_thread->gc_wait_queue_head); wake_up_discard_thread(sbi, true); @@ -521,7 +532,7 @@ out: } else if (t == 3) { sbi->gc_mode = GC_URGENT_MID; if (sbi->gc_thread) { - sbi->gc_thread->gc_wake = 1; + sbi->gc_thread->gc_wake = true; wake_up_interruptible_all( &sbi->gc_thread->gc_wait_queue_head); } @@ -678,7 +689,16 @@ out: } if (!strcmp(a->attr.name, "warm_data_age_threshold")) { - if (t == 0 || t <= sbi->hot_data_age_threshold) + if (t <= sbi->hot_data_age_threshold) + return -EINVAL; + if (t == *ui) + return count; + *ui = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "last_age_weight")) { + if (t > 100) return -EINVAL; if (t == *ui) return count; @@ -686,6 +706,15 @@ out: return count; } + if (!strcmp(a->attr.name, "ipu_policy")) { + if (t >= BIT(F2FS_IPU_MAX)) + return -EINVAL; + if (t && f2fs_lfs_mode(sbi)) + return -EINVAL; + SM_I(sbi)->ipu_policy = (unsigned int)t; + return count; + } + *ui = (unsigned int)t; return count; @@ -825,6 +854,7 @@ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_request, max_discard_req F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, 
mid_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_io_aware_gran, discard_io_aware_gran); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_urgent_util, discard_urgent_util); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_ordered_discard, max_ordered_discard); @@ -944,6 +974,7 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, revoked_atomic_block, revoked_atomic_block) /* For block age extent cache */ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, hot_data_age_threshold, hot_data_age_threshold); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, warm_data_age_threshold, warm_data_age_threshold); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, last_age_weight, last_age_weight); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -960,6 +991,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(min_discard_issue_time), ATTR_LIST(mid_discard_issue_time), ATTR_LIST(max_discard_issue_time), + ATTR_LIST(discard_io_aware_gran), ATTR_LIST(discard_urgent_util), ATTR_LIST(discard_granularity), ATTR_LIST(max_ordered_discard), @@ -1042,6 +1074,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(revoked_atomic_block), ATTR_LIST(hot_data_age_threshold), ATTR_LIST(warm_data_age_threshold), + ATTR_LIST(last_age_weight), NULL, }; ATTRIBUTE_GROUPS(f2fs); @@ -1129,13 +1162,13 @@ static const struct sysfs_ops f2fs_attr_ops = { .store = f2fs_attr_store, }; -static struct kobj_type f2fs_sb_ktype = { +static const struct kobj_type f2fs_sb_ktype = { .default_groups = f2fs_groups, .sysfs_ops = &f2fs_attr_ops, .release = f2fs_sb_release, }; -static struct kobj_type f2fs_ktype = { +static const struct kobj_type f2fs_ktype = { .sysfs_ops = &f2fs_attr_ops, }; @@ -1143,7 +1176,7 @@ static struct kset f2fs_kset = { .kobj = {.ktype = &f2fs_ktype}, }; -static struct kobj_type f2fs_feat_ktype = { +static const struct kobj_type f2fs_feat_ktype = { .default_groups = f2fs_feat_groups, .sysfs_ops = &f2fs_attr_ops, }; @@ -1184,7 +1217,7 @@ static const struct sysfs_ops f2fs_stat_attr_ops = { .store = f2fs_stat_attr_store, }; -static struct kobj_type f2fs_stat_ktype = { +static const struct kobj_type f2fs_stat_ktype = { .default_groups = f2fs_stat_groups, .sysfs_ops = &f2fs_stat_attr_ops, .release = f2fs_stat_kobj_release, @@ -1211,7 +1244,7 @@ static const struct sysfs_ops f2fs_feature_list_attr_ops = { .show = f2fs_sb_feat_attr_show, }; -static struct kobj_type f2fs_feature_list_ktype = { +static const struct kobj_type f2fs_feature_list_ktype = { .default_groups = f2fs_sb_feat_groups, .sysfs_ops = &f2fs_feature_list_attr_ops, .release = f2fs_feature_list_kobj_release, diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index c352fff88a5e..4fc95f353a7a 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -81,7 +81,7 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, size_t n = min_t(size_t, count, PAGE_SIZE - offset_in_page(pos)); struct page *page; - void *fsdata; + void *fsdata = NULL; int res; res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata); @@ -276,11 +276,11 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode, } static int f2fs_write_merkle_tree_block(struct inode *inode, const void *buf, - u64 index, int log_blocksize) + u64 pos, unsigned int size) { - loff_t pos = f2fs_verity_metadata_pos(inode) + (index << log_blocksize); + pos += 
f2fs_verity_metadata_pos(inode); - return pagecache_write(inode, buf, 1 << log_blocksize, pos); + return pagecache_write(inode, buf, size, pos); } const struct fsverity_operations f2fs_verityops = { diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index dc2e8637189e..d92edbbdc30e 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -65,7 +65,7 @@ static int f2fs_xattr_generic_get(const struct xattr_handler *handler, } static int f2fs_xattr_generic_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -109,7 +109,7 @@ static int f2fs_xattr_advise_get(const struct xattr_handler *handler, } static int f2fs_xattr_advise_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -117,7 +117,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, unsigned char old_advise = F2FS_I(inode)->i_advise; unsigned char new_advise; - if (!inode_owner_or_capable(&init_user_ns, inode)) + if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) return -EPERM; if (value == NULL) return -EINVAL; diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig index 238cc55f84c4..afe83b4e7172 100644 --- a/fs/fat/Kconfig +++ b/fs/fat/Kconfig @@ -2,6 +2,7 @@ config FAT_FS tristate select NLS + select LEGACY_DIRECT_IO help If you want to use one of the FAT-based file systems (the MS-DOS and VFAT (Windows 95) file systems), then you must say Y or M here diff --git a/fs/fat/fat.h b/fs/fat/fat.h index a415c02ede39..e3b690b48e3e 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -398,10 +398,10 @@ extern long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); extern const struct file_operations fat_file_operations; extern const struct inode_operations fat_file_inode_operations; -extern int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +extern int fat_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); extern void fat_truncate_blocks(struct inode *inode, loff_t offset); -extern int fat_getattr(struct user_namespace *mnt_userns, +extern int fat_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); extern int fat_file_fsync(struct file *file, loff_t start, loff_t end, diff --git a/fs/fat/file.c b/fs/fat/file.c index 8a6b493b5b5f..795a4fad5c40 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -90,13 +90,13 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) * out the RO attribute for checking by the security * module, just because it maps to a file mode. */ - err = security_inode_setattr(file_mnt_user_ns(file), + err = security_inode_setattr(file_mnt_idmap(file), file->f_path.dentry, &ia); if (err) goto out_unlock_inode; /* This MUST be done before doing anything irreversible... 
*/ - err = fat_setattr(file_mnt_user_ns(file), file->f_path.dentry, &ia); + err = fat_setattr(file_mnt_idmap(file), file->f_path.dentry, &ia); if (err) goto out_unlock_inode; @@ -395,13 +395,13 @@ void fat_truncate_blocks(struct inode *inode, loff_t offset) fat_flush_inodes(inode->i_sb, inode, NULL); } -int fat_getattr(struct user_namespace *mnt_userns, const struct path *path, +int fat_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); - generic_fillattr(mnt_userns, inode, stat); + generic_fillattr(idmap, inode, stat); stat->blksize = sbi->cluster_size; if (sbi->options.nfs == FAT_NFS_NOSTALE_RO) { @@ -456,14 +456,14 @@ static int fat_sanitize_mode(const struct msdos_sb_info *sbi, return 0; } -static int fat_allow_set_time(struct user_namespace *mnt_userns, +static int fat_allow_set_time(struct mnt_idmap *idmap, struct msdos_sb_info *sbi, struct inode *inode) { umode_t allow_utime = sbi->options.allow_utime; - if (!vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), + if (!vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) { - if (vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode))) + if (vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode))) allow_utime >>= 3; if (allow_utime & MAY_WRITE) return 1; @@ -477,7 +477,7 @@ static int fat_allow_set_time(struct user_namespace *mnt_userns, /* valid file mode bits */ #define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO) -int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int fat_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); @@ -488,11 +488,11 @@ int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, /* Check for setting the inode time. 
*/ ia_valid = attr->ia_valid; if (ia_valid & TIMES_SET_FLAGS) { - if (fat_allow_set_time(mnt_userns, sbi, inode)) + if (fat_allow_set_time(idmap, sbi, inode)) attr->ia_valid &= ~TIMES_SET_FLAGS; } - error = setattr_prepare(mnt_userns, dentry, attr); + error = setattr_prepare(idmap, dentry, attr); attr->ia_valid = ia_valid; if (error) { if (sbi->options.quiet) @@ -518,10 +518,10 @@ int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } if (((attr->ia_valid & ATTR_UID) && - (!uid_eq(from_vfsuid(mnt_userns, i_user_ns(inode), attr->ia_vfsuid), + (!uid_eq(from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid), sbi->options.fs_uid))) || ((attr->ia_valid & ATTR_GID) && - (!gid_eq(from_vfsgid(mnt_userns, i_user_ns(inode), attr->ia_vfsgid), + (!gid_eq(from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid), sbi->options.fs_gid))) || ((attr->ia_valid & ATTR_MODE) && (attr->ia_mode & ~FAT_VALID_MODE))) @@ -564,7 +564,7 @@ int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, fat_truncate_time(inode, &attr->ia_mtime, S_MTIME); attr->ia_valid &= ~(ATTR_ATIME|ATTR_CTIME|ATTR_MTIME); - setattr_copy(mnt_userns, inode, attr); + setattr_copy(idmap, inode, attr); mark_inode_dirty(inode); out: return error; diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index efba301d68ae..2116c486843b 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -261,7 +261,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name, } /***** Create a file */ -static int msdos_create(struct user_namespace *mnt_userns, struct inode *dir, +static int msdos_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct super_block *sb = dir->i_sb; @@ -339,7 +339,7 @@ out: } /***** Make a directory */ -static int msdos_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int msdos_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct super_block *sb = dir->i_sb; @@ -594,7 +594,7 @@ error_inode: } /***** Rename, a wrapper for rename_same_dir & rename_diff_dir */ -static int msdos_rename(struct user_namespace *mnt_userns, +static int msdos_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 21620054e1c4..c4d00999a433 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -200,7 +200,7 @@ static const struct dentry_operations vfat_dentry_ops = { /* Characters that are undesirable in an MS-DOS file name */ -static inline wchar_t vfat_bad_char(wchar_t w) +static inline bool vfat_bad_char(wchar_t w) { return (w < 0x0020) || (w == '*') || (w == '?') || (w == '<') || (w == '>') @@ -208,7 +208,7 @@ static inline wchar_t vfat_bad_char(wchar_t w) || (w == '\\'); } -static inline wchar_t vfat_replace_char(wchar_t w) +static inline bool vfat_replace_char(wchar_t w) { return (w == '[') || (w == ']') || (w == ';') || (w == ',') || (w == '+') || (w == '='); @@ -756,7 +756,7 @@ error: return ERR_PTR(err); } -static int vfat_create(struct user_namespace *mnt_userns, struct inode *dir, +static int vfat_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct super_block *sb = dir->i_sb; @@ -844,7 +844,7 @@ out: return err; } -static int vfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int vfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, 
struct dentry *dentry, umode_t mode) { struct super_block *sb = dir->i_sb; @@ -1158,7 +1158,7 @@ error_exchange: goto out; } -static int vfat_rename2(struct user_namespace *mnt_userns, struct inode *old_dir, +static int vfat_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/fcntl.c b/fs/fcntl.c index 146c9ab0cd4b..b622be119706 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -10,6 +10,7 @@ #include <linux/mm.h> #include <linux/sched/task.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/capability.h> @@ -47,7 +48,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg) /* O_NOATIME can only be set by the owner or superuser */ if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME)) - if (!inode_owner_or_capable(file_mnt_user_ns(filp), inode)) + if (!inode_owner_or_capable(file_mnt_idmap(filp), inode)) return -EPERM; /* required for strict SunOS emulation */ diff --git a/fs/file_table.c b/fs/file_table.c index dd88701e54a9..372653b92617 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -13,6 +13,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/eventpoll.h> diff --git a/fs/freevxfs/Kconfig b/fs/freevxfs/Kconfig index c05c71d57291..0e2fc08f7de4 100644 --- a/fs/freevxfs/Kconfig +++ b/fs/freevxfs/Kconfig @@ -8,7 +8,7 @@ config VXFS_FS of SCO UnixWare (and possibly others) and optionally available for Sunsoft Solaris, HP-UX and many other operating systems. However these particular OS implementations of vxfs may differ in on-disk - data endianess and/or superblock offset. The vxfs module has been + data endianness and/or superblock offset. The vxfs module has been tested with SCO UnixWare and HP-UX B.10.20 (pa-risc 1.1 arch.) Currently only readonly access is supported and VxFX versions 2, 3 and 4. Tests were performed with HP-UX VxFS version 3. diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c index c99282df7761..f439877ea6e8 100644 --- a/fs/freevxfs/vxfs_subr.c +++ b/fs/freevxfs/vxfs_subr.c @@ -31,7 +31,7 @@ vxfs_put_page(struct page *pp) /** * vxfs_get_page - read a page into memory. - * @ip: inode to read from + * @mapping: mapping to read from * @n: page number * * Description: @@ -81,14 +81,14 @@ vxfs_bread(struct inode *ip, int block) } /** - * vxfs_get_block - locate buffer for given inode,block tuple + * vxfs_getblk - locate buffer for given inode,block tuple * @ip: inode * @iblock: logical block * @bp: buffer skeleton * @create: %TRUE if blocks may be newly allocated. * * Description: - * The vxfs_get_block function fills @bp with the right physical + * The vxfs_getblk function fills @bp with the right physical * block and device number to perform a lowlevel read/write on * it. 
* diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index c3b82f716f9a..310d73e254df 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -165,7 +165,7 @@ static int vxfs_try_sb_magic(struct super_block *sbp, int silent, } /** - * vxfs_read_super - read superblock into memory and initialize filesystem + * vxfs_fill_super - read superblock into memory and initialize filesystem * @sbp: VFS superblock (to fill) * @dp: fs private mount data * @silent: do not complain loudly when sth is wrong diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 6fba5a52127b..195dc23e0d83 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -237,7 +237,7 @@ void wb_wait_for_completion(struct wb_completion *done) static atomic_t isw_nr_in_flight = ATOMIC_INIT(0); static struct workqueue_struct *isw_wq; -void __inode_attach_wb(struct inode *inode, struct page *page) +void __inode_attach_wb(struct inode *inode, struct folio *folio) { struct backing_dev_info *bdi = inode_to_bdi(inode); struct bdi_writeback *wb = NULL; @@ -245,8 +245,8 @@ void __inode_attach_wb(struct inode *inode, struct page *page) if (inode_cgwb_enabled(inode)) { struct cgroup_subsys_state *memcg_css; - if (page) { - memcg_css = mem_cgroup_css_from_page(page); + if (folio) { + memcg_css = mem_cgroup_css_from_folio(folio); wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); } else { /* must pin memcg_css, see wb_get_create() */ @@ -859,6 +859,7 @@ EXPORT_SYMBOL_GPL(wbc_detach_inode); void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, size_t bytes) { + struct folio *folio; struct cgroup_subsys_state *css; int id; @@ -871,7 +872,8 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, if (!wbc->wb || wbc->no_cgroup_owner) return; - css = mem_cgroup_css_from_page(page); + folio = page_folio(page); + css = mem_cgroup_css_from_folio(folio); /* dead cgroups shouldn't contribute to inode ownership arbitration */ if (!(css->flags & CSS_ONLINE)) return; diff --git a/fs/fscache/volume.c b/fs/fscache/volume.c index ab8ceddf9efa..cdf991bdd9de 100644 --- a/fs/fscache/volume.c +++ b/fs/fscache/volume.c @@ -141,13 +141,14 @@ static bool fscache_is_acquire_pending(struct fscache_volume *volume) static void fscache_wait_on_volume_collision(struct fscache_volume *candidate, unsigned int collidee_debug_id) { - wait_var_event_timeout(&candidate->flags, - !fscache_is_acquire_pending(candidate), 20 * HZ); + wait_on_bit_timeout(&candidate->flags, FSCACHE_VOLUME_ACQUIRE_PENDING, + TASK_UNINTERRUPTIBLE, 20 * HZ); if (fscache_is_acquire_pending(candidate)) { pr_notice("Potential volume collision new=%08x old=%08x", candidate->debug_id, collidee_debug_id); fscache_stat(&fscache_n_volumes_collision); - wait_var_event(&candidate->flags, !fscache_is_acquire_pending(candidate)); + wait_on_bit(&candidate->flags, FSCACHE_VOLUME_ACQUIRE_PENDING, + TASK_UNINTERRUPTIBLE); } } @@ -279,8 +280,7 @@ static void fscache_create_volume_work(struct work_struct *work) fscache_end_cache_access(volume->cache, fscache_access_acquire_volume_end); - clear_bit_unlock(FSCACHE_VOLUME_CREATING, &volume->flags); - wake_up_bit(&volume->flags, FSCACHE_VOLUME_CREATING); + clear_and_wake_up_bit(FSCACHE_VOLUME_CREATING, &volume->flags); fscache_put_volume(volume, fscache_volume_put_create_work); } @@ -347,8 +347,8 @@ static void fscache_wake_pending_volume(struct fscache_volume *volume, hlist_bl_for_each_entry(cursor, p, h, hash_link) { if (fscache_volume_same(cursor, volume)) { fscache_see_volume(cursor, 
fscache_volume_see_hash_wake); - clear_bit(FSCACHE_VOLUME_ACQUIRE_PENDING, &cursor->flags); - wake_up_bit(&cursor->flags, FSCACHE_VOLUME_ACQUIRE_PENDING); + clear_and_wake_up_bit(FSCACHE_VOLUME_ACQUIRE_PENDING, + &cursor->flags); return; } } diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index a4850aee2639..3d192b80a561 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -11,9 +11,10 @@ #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> -struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu) +static struct posix_acl *__fuse_get_acl(struct fuse_conn *fc, + struct mnt_idmap *idmap, + struct inode *inode, int type, bool rcu) { - struct fuse_conn *fc = get_fuse_conn(inode); int size; const char *name; void *value = NULL; @@ -25,7 +26,7 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu) if (fuse_is_bad(inode)) return ERR_PTR(-EIO); - if (!fc->posix_acl || fc->no_getxattr) + if (fc->no_getxattr) return NULL; if (type == ACL_TYPE_ACCESS) @@ -53,7 +54,47 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu) return acl; } -int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +static inline bool fuse_no_acl(const struct fuse_conn *fc, + const struct inode *inode) +{ + /* + * Refuse interacting with POSIX ACLs for daemons that + * don't support FUSE_POSIX_ACL and are not mounted on + * the host to retain backwards compatibility. + */ + return !fc->posix_acl && (i_user_ns(inode) != &init_user_ns); +} + +struct posix_acl *fuse_get_acl(struct mnt_idmap *idmap, + struct dentry *dentry, int type) +{ + struct inode *inode = d_inode(dentry); + struct fuse_conn *fc = get_fuse_conn(inode); + + if (fuse_no_acl(fc, inode)) + return ERR_PTR(-EOPNOTSUPP); + + return __fuse_get_acl(fc, idmap, inode, type, false); +} + +struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + + /* + * FUSE daemons before FUSE_POSIX_ACL was introduced could get and set + * POSIX ACLs without them being used for permission checking by the + * vfs. Retain that behavior for backwards compatibility as there are + * filesystems that do all permission checking for acls in the daemon + * and not in the kernel. + */ + if (!fc->posix_acl) + return NULL; + + return __fuse_get_acl(fc, &nop_mnt_idmap, inode, type, rcu); +} + +int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { struct inode *inode = d_inode(dentry); @@ -64,7 +105,7 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, if (fuse_is_bad(inode)) return -EIO; - if (!fc->posix_acl || fc->no_setxattr) + if (fc->no_setxattr || fuse_no_acl(fc, inode)) return -EOPNOTSUPP; if (type == ACL_TYPE_ACCESS) @@ -99,8 +140,14 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, return ret; } - if (!vfsgid_in_group_p(i_gid_into_vfsgid(&init_user_ns, inode)) && - !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) + /* + * Fuse daemons without FUSE_POSIX_ACL never changed the passed + * through POSIX ACLs. Such daemons don't expect setgid bits to + * be stripped. 
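The fuse_no_acl() helper introduced above gates both the ->get_acl and ->set_acl paths: a daemon that never negotiated FUSE_POSIX_ACL keeps its historical raw-xattr ACL passthrough, but only when the connection is mounted in the initial user namespace; otherwise the operation is refused. A toy enumeration of that predicate, with the two inputs reduced to booleans for illustration:

    #include <stdio.h>
    #include <stdbool.h>

    /* Model of the fuse_no_acl() decision above: refuse POSIX ACL
     * handling only for legacy daemons on non-host mounts. */
    static bool fuse_no_acl_model(bool posix_acl, bool init_userns_mount)
    {
        return !posix_acl && !init_userns_mount;
    }

    int main(void)
    {
        for (int acl = 0; acl <= 1; acl++)
            for (int host = 0; host <= 1; host++)
                printf("posix_acl=%d init_userns=%d -> %s\n",
                       acl, host,
                       fuse_no_acl_model(acl, host) ?
                            "-EOPNOTSUPP" : "handled");
        return 0;
    }

Note that fuse_get_inode_acl() keeps returning NULL for legacy daemons, so permission checking for their ACLs stays in the daemon rather than the VFS, exactly as before the change.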
+ */ + if (fc->posix_acl && + !vfsgid_in_group_p(i_gid_into_vfsgid(&nop_mnt_idmap, inode)) && + !capable_wrt_inode_uidgid(&nop_mnt_idmap, inode, CAP_FSETID)) extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID; ret = fuse_setxattr(inode, name, value, size, 0, extra_flags); @@ -108,8 +155,15 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, } else { ret = fuse_removexattr(inode, name); } - forget_all_cached_acls(inode); - fuse_invalidate_attr(inode); + + if (fc->posix_acl) { + /* + * Fuse daemons without FUSE_POSIX_ACL never cached POSIX ACLs + * and didn't invalidate attributes. Retain that behavior. + */ + forget_all_cached_acls(inode); + fuse_invalidate_attr(inode); + } return ret; } diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index a06fbb1a8a5b..179a5c5e28fd 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -256,7 +256,7 @@ static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp) } /** - * cuse_parse_dev_info - parse device info + * cuse_parse_devinfo - parse device info * @p: device info string * @len: length of device info string * @devinfo: out parameter for parsed device info diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index e23e802a8013..8e74f278a3f6 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -860,7 +860,7 @@ int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); vma->vm_ops = &fuse_dax_vm_ops; - vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + vm_flags_set(vma, VM_MIXEDMAP | VM_HUGEPAGE); return 0; } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index e8b60ce72c9a..eb4f88e3dc97 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -204,7 +204,7 @@ static unsigned int fuse_req_hash(u64 unique) return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS); } -/** +/* * A new request is available, wake fiq->waitq */ static void fuse_dev_wake_and_unlock(struct fuse_iqueue *fiq) @@ -476,6 +476,8 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) req->in.h.opcode = args->opcode; req->in.h.nodeid = args->nodeid; req->args = args; + if (args->is_ext) + req->in.h.total_extlen = args->in_args[args->ext_idx].size / 8; if (args->end) __set_bit(FR_ASYNC, &req->flags); } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index cd1a071b625a..35bc174f9ba2 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -145,7 +145,7 @@ static void fuse_dir_changed(struct inode *dir) inode_maybe_inc_iversion(dir, false); } -/** +/* * Mark the attributes as stale due to an atime change. Avoid the invalidate if * atime is not used. 
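The fs/fuse/dax.c hunk above replaces the direct write to vma->vm_flags with the vm_flags_set() accessor. The userspace reduction below shows the point of funneling flag updates through one setter: a single place where invariants (in mainline, a write-lock assertion on the mm) can be enforced. The write_locked field and the assert are assumptions of this sketch, not the kernel's layout.

    #include <stdio.h>
    #include <assert.h>

    struct vma {
        unsigned long vm_flags;
        int write_locked;           /* stands in for the mmap lock */
    };

    /* Central choke point for flag updates, so locking rules can be
     * checked in one place instead of at every |= site. */
    static void vm_flags_set_(struct vma *vma, unsigned long flags)
    {
        assert(vma->write_locked);
        vma->vm_flags |= flags;
    }

    int main(void)
    {
        struct vma v = { .write_locked = 1 };

        vm_flags_set_(&v, 0x10UL | 0x20UL);
        printf("flags=%#lx\n", v.vm_flags);
        return 0;
    }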
*/ @@ -466,7 +466,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, } static int get_security_context(struct dentry *entry, umode_t mode, - void **security_ctx, u32 *security_ctxlen) + struct fuse_in_arg *ext) { struct fuse_secctx *fctx; struct fuse_secctx_header *header; @@ -513,14 +513,100 @@ static int get_security_context(struct dentry *entry, umode_t mode, memcpy(ptr, ctx, ctxlen); } - *security_ctxlen = total_len; - *security_ctx = header; + ext->size = total_len; + ext->value = header; err = 0; out_err: kfree(ctx); return err; } +static void *extend_arg(struct fuse_in_arg *buf, u32 bytes) +{ + void *p; + u32 newlen = buf->size + bytes; + + p = krealloc(buf->value, newlen, GFP_KERNEL); + if (!p) { + kfree(buf->value); + buf->size = 0; + buf->value = NULL; + return NULL; + } + + memset(p + buf->size, 0, bytes); + buf->value = p; + buf->size = newlen; + + return p + newlen - bytes; +} + +static u32 fuse_ext_size(size_t size) +{ + return FUSE_REC_ALIGN(sizeof(struct fuse_ext_header) + size); +} + +/* + * This adds just a single supplementary group that matches the parent's group. + */ +static int get_create_supp_group(struct inode *dir, struct fuse_in_arg *ext) +{ + struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_ext_header *xh; + struct fuse_supp_groups *sg; + kgid_t kgid = dir->i_gid; + gid_t parent_gid = from_kgid(fc->user_ns, kgid); + u32 sg_len = fuse_ext_size(sizeof(*sg) + sizeof(sg->groups[0])); + + if (parent_gid == (gid_t) -1 || gid_eq(kgid, current_fsgid()) || + !in_group_p(kgid)) + return 0; + + xh = extend_arg(ext, sg_len); + if (!xh) + return -ENOMEM; + + xh->size = sg_len; + xh->type = FUSE_EXT_GROUPS; + + sg = (struct fuse_supp_groups *) &xh[1]; + sg->nr_groups = 1; + sg->groups[0] = parent_gid; + + return 0; +} + +static int get_create_ext(struct fuse_args *args, + struct inode *dir, struct dentry *dentry, + umode_t mode) +{ + struct fuse_conn *fc = get_fuse_conn_super(dentry->d_sb); + struct fuse_in_arg ext = { .size = 0, .value = NULL }; + int err = 0; + + if (fc->init_security) + err = get_security_context(dentry, mode, &ext); + if (!err && fc->create_supp_group) + err = get_create_supp_group(dir, &ext); + + if (!err && ext.size) { + WARN_ON(args->in_numargs >= ARRAY_SIZE(args->in_args)); + args->is_ext = true; + args->ext_idx = args->in_numargs++; + args->in_args[args->ext_idx] = ext; + } else { + kfree(ext.value); + } + + return err; +} + +static void free_ext_value(struct fuse_args *args) +{ + if (args->is_ext) + kfree(args->in_args[args->ext_idx].value); +} + /* * Atomic create+open operation * @@ -541,8 +627,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, struct fuse_entry_out outentry; struct fuse_inode *fi; struct fuse_file *ff; - void *security_ctx = NULL; - u32 security_ctxlen; bool trunc = flags & O_TRUNC; /* Userspace expects S_IFREG in create mode */ @@ -586,19 +670,12 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, args.out_args[1].size = sizeof(outopen); args.out_args[1].value = &outopen; - if (fm->fc->init_security) { - err = get_security_context(entry, mode, &security_ctx, - &security_ctxlen); - if (err) - goto out_put_forget_req; - - args.in_numargs = 3; - args.in_args[2].size = security_ctxlen; - args.in_args[2].value = security_ctx; - } + err = get_create_ext(&args, dir, entry, mode); + if (err) + goto out_put_forget_req; err = fuse_simple_request(fm, &args); - kfree(security_ctx); + free_ext_value(&args); if (err) goto out_free_ff; @@ -645,7 +722,7 @@ out_err: 
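The extension plumbing above (extend_arg(), fuse_ext_size(), get_create_supp_group(), get_create_ext()) appends 8-byte-aligned records after the regular request arguments, and the fs/fuse/dev.c hunk earlier advertises the total to the daemon in 8-byte units via in.h.total_extlen, hence the division by 8. A worked size calculation; the struct layouts are assumed to mirror include/uapi/linux/fuse.h and should be checked there:

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    /* Assumed uapi layouts for illustration. */
    struct fuse_ext_header {
        uint32_t size;
        uint32_t type;
    };

    struct fuse_supp_groups {
        uint32_t nr_groups;
        uint32_t groups[];
    };

    /* Equivalent of FUSE_REC_ALIGN() for 8-byte records. */
    #define FUSE_REC_ALIGN_(x) (((x) + 7) & ~(size_t)7)

    int main(void)
    {
        /* One FUSE_EXT_GROUPS record carrying a single gid, as built
         * by get_create_supp_group() above. */
        size_t payload = sizeof(struct fuse_supp_groups) + sizeof(uint32_t);
        size_t rec = FUSE_REC_ALIGN_(sizeof(struct fuse_ext_header) + payload);

        printf("payload=%zu record=%zu total_extlen=%zu\n",
               payload, rec, rec / 8);      /* 8, 16, 2 */
        return 0;
    }

The krealloc()-based extend_arg() lets several extension producers (security context, supplementary group) append records independently; a failed grow frees the buffer and zeroes the arg so the caller's free_ext_value() stays safe.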
return err; } -static int fuse_mknod(struct user_namespace *, struct inode *, struct dentry *, +static int fuse_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); static int fuse_atomic_open(struct inode *dir, struct dentry *entry, struct file *file, unsigned flags, @@ -686,7 +763,7 @@ out_dput: return err; mknod: - err = fuse_mknod(&init_user_ns, dir, entry, mode, 0); + err = fuse_mknod(&nop_mnt_idmap, dir, entry, mode, 0); if (err) goto out_dput; no_open: @@ -705,8 +782,6 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, struct dentry *d; int err; struct fuse_forget_link *forget; - void *security_ctx = NULL; - u32 security_ctxlen; if (fuse_is_bad(dir)) return -EIO; @@ -721,21 +796,14 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, args->out_args[0].size = sizeof(outarg); args->out_args[0].value = &outarg; - if (fm->fc->init_security && args->opcode != FUSE_LINK) { - err = get_security_context(entry, mode, &security_ctx, - &security_ctxlen); + if (args->opcode != FUSE_LINK) { + err = get_create_ext(args, dir, entry, mode); if (err) goto out_put_forget_req; - - BUG_ON(args->in_numargs != 2); - - args->in_numargs = 3; - args->in_args[2].size = security_ctxlen; - args->in_args[2].value = security_ctx; } err = fuse_simple_request(fm, args); - kfree(security_ctx); + free_ext_value(args); if (err) goto out_put_forget_req; @@ -773,7 +841,7 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, return err; } -static int fuse_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int fuse_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, umode_t mode, dev_t rdev) { struct fuse_mknod_in inarg; @@ -796,13 +864,13 @@ static int fuse_mknod(struct user_namespace *mnt_userns, struct inode *dir, return create_new_entry(fm, &args, dir, entry, mode); } -static int fuse_create(struct user_namespace *mnt_userns, struct inode *dir, +static int fuse_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, umode_t mode, bool excl) { - return fuse_mknod(&init_user_ns, dir, entry, mode, 0); + return fuse_mknod(&nop_mnt_idmap, dir, entry, mode, 0); } -static int fuse_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +static int fuse_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct fuse_conn *fc = get_fuse_conn(dir); @@ -819,7 +887,7 @@ static int fuse_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, return err; } -static int fuse_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, umode_t mode) { struct fuse_mkdir_in inarg; @@ -841,7 +909,7 @@ static int fuse_mkdir(struct user_namespace *mnt_userns, struct inode *dir, return create_new_entry(fm, &args, dir, entry, S_IFDIR); } -static int fuse_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int fuse_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, const char *link) { struct fuse_mount *fm = get_fuse_mount(dir); @@ -998,7 +1066,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, return err; } -static int fuse_rename2(struct user_namespace *mnt_userns, struct inode *olddir, +static int fuse_rename2(struct mnt_idmap *idmap, struct inode *olddir, struct dentry *oldent, struct inode *newdir, struct dentry *newent, unsigned int flags) { @@ -1156,7 +1224,7 @@ static int 
fuse_update_get_attr(struct inode *inode, struct file *file, forget_all_cached_acls(inode); err = fuse_do_getattr(inode, stat, file); } else if (stat) { - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); stat->mode = fi->orig_i_mode; stat->ino = fi->orig_ino; } @@ -1326,7 +1394,7 @@ static int fuse_perm_getattr(struct inode *inode, int mask) * access request is sent. Execute permission is still checked * locally based on file mode. */ -static int fuse_permission(struct user_namespace *mnt_userns, +static int fuse_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct fuse_conn *fc = get_fuse_conn(inode); @@ -1358,7 +1426,7 @@ static int fuse_permission(struct user_namespace *mnt_userns, } if (fc->default_permissions) { - err = generic_permission(&init_user_ns, inode, mask); + err = generic_permission(&nop_mnt_idmap, inode, mask); /* If permission is denied, try to refresh file attributes. This is also needed, because the root @@ -1366,7 +1434,7 @@ static int fuse_permission(struct user_namespace *mnt_userns, if (err == -EACCES && !refreshed) { err = fuse_perm_getattr(inode, mask); if (!err) - err = generic_permission(&init_user_ns, + err = generic_permission(&nop_mnt_idmap, inode, mask); } @@ -1690,7 +1758,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, if (!fc->default_permissions) attr->ia_valid |= ATTR_FORCE; - err = setattr_prepare(&init_user_ns, dentry, attr); + err = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (err) return err; @@ -1837,7 +1905,7 @@ error: return err; } -static int fuse_setattr(struct user_namespace *mnt_userns, struct dentry *entry, +static int fuse_setattr(struct mnt_idmap *idmap, struct dentry *entry, struct iattr *attr) { struct inode *inode = d_inode(entry); @@ -1900,7 +1968,7 @@ static int fuse_setattr(struct user_namespace *mnt_userns, struct dentry *entry, return ret; } -static int fuse_getattr(struct user_namespace *mnt_userns, +static int fuse_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { @@ -1942,7 +2010,8 @@ static const struct inode_operations fuse_dir_inode_operations = { .permission = fuse_permission, .getattr = fuse_getattr, .listxattr = fuse_listxattr, - .get_inode_acl = fuse_get_acl, + .get_inode_acl = fuse_get_inode_acl, + .get_acl = fuse_get_acl, .set_acl = fuse_set_acl, .fileattr_get = fuse_fileattr_get, .fileattr_set = fuse_fileattr_set, @@ -1964,7 +2033,8 @@ static const struct inode_operations fuse_common_inode_operations = { .permission = fuse_permission, .getattr = fuse_getattr, .listxattr = fuse_listxattr, - .get_inode_acl = fuse_get_acl, + .get_inode_acl = fuse_get_inode_acl, + .get_acl = fuse_get_acl, .set_acl = fuse_set_acl, .fileattr_get = fuse_fileattr_get, .fileattr_set = fuse_fileattr_set, diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 875314ee6f59..de37a3a06a71 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -18,6 +18,8 @@ #include <linux/falloc.h> #include <linux/uio.h> #include <linux/fs.h> +#include <linux/filelock.h> +#include <linux/file.h> static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, int opcode, @@ -477,48 +479,36 @@ static void fuse_sync_writes(struct inode *inode) fuse_release_nowrite(inode); } -static int fuse_flush(struct file *file, fl_owner_t id) -{ - struct inode *inode = file_inode(file); - struct fuse_mount *fm = get_fuse_mount(inode); - struct fuse_file *ff = file->private_data; +struct fuse_flush_args { + 
struct fuse_args args; struct fuse_flush_in inarg; - FUSE_ARGS(args); - int err; - - if (fuse_is_bad(inode)) - return -EIO; + struct work_struct work; + struct file *file; +}; - if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache) - return 0; +static int fuse_do_flush(struct fuse_flush_args *fa) +{ + int err; + struct inode *inode = file_inode(fa->file); + struct fuse_mount *fm = get_fuse_mount(inode); err = write_inode_now(inode, 1); if (err) - return err; + goto out; inode_lock(inode); fuse_sync_writes(inode); inode_unlock(inode); - err = filemap_check_errors(file->f_mapping); + err = filemap_check_errors(fa->file->f_mapping); if (err) - return err; + goto out; err = 0; if (fm->fc->no_flush) goto inval_attr_out; - memset(&inarg, 0, sizeof(inarg)); - inarg.fh = ff->fh; - inarg.lock_owner = fuse_lock_owner_id(fm->fc, id); - args.opcode = FUSE_FLUSH; - args.nodeid = get_node_id(inode); - args.in_numargs = 1; - args.in_args[0].size = sizeof(inarg); - args.in_args[0].value = &inarg; - args.force = true; - - err = fuse_simple_request(fm, &args); + err = fuse_simple_request(fm, &fa->args); if (err == -ENOSYS) { fm->fc->no_flush = 1; err = 0; @@ -531,9 +521,57 @@ inval_attr_out: */ if (!err && fm->fc->writeback_cache) fuse_invalidate_attr_mask(inode, STATX_BLOCKS); + +out: + fput(fa->file); + kfree(fa); return err; } +static void fuse_flush_async(struct work_struct *work) +{ + struct fuse_flush_args *fa = container_of(work, typeof(*fa), work); + + fuse_do_flush(fa); +} + +static int fuse_flush(struct file *file, fl_owner_t id) +{ + struct fuse_flush_args *fa; + struct inode *inode = file_inode(file); + struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_file *ff = file->private_data; + + if (fuse_is_bad(inode)) + return -EIO; + + if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache) + return 0; + + fa = kzalloc(sizeof(*fa), GFP_KERNEL); + if (!fa) + return -ENOMEM; + + fa->inarg.fh = ff->fh; + fa->inarg.lock_owner = fuse_lock_owner_id(fm->fc, id); + fa->args.opcode = FUSE_FLUSH; + fa->args.nodeid = get_node_id(inode); + fa->args.in_numargs = 1; + fa->args.in_args[0].size = sizeof(fa->inarg); + fa->args.in_args[0].value = &fa->inarg; + fa->args.force = true; + fa->file = get_file(file); + + /* Don't wait if the task is exiting */ + if (current->flags & PF_EXITING) { + INIT_WORK(&fa->work, fuse_flush_async); + schedule_work(&fa->work); + return 0; + } + + return fuse_do_flush(fa); +} + int fuse_fsync_common(struct file *file, loff_t start, loff_t end, int datasync, int opcode) { @@ -652,7 +690,7 @@ static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io) return io->bytes < 0 ? io->size : io->bytes; } -/** +/* * In case of short read, the caller sets 'pos' to the position of * actual end of fuse request in IO request. Otherwise, if bytes_requested * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1. 
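The fuse_flush() rework just above packages the request into a heap-allocated fuse_flush_args precisely so that an exiting task (PF_EXITING) can hand the flush to a workqueue instead of sleeping on a reply it may never be able to wait for. A pthread-based userspace model of the same shape; the fd stand-in and all names are hypothetical:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <stdbool.h>

    struct flush_args {
        int fd;                     /* stands in for struct file */
    };

    static int do_flush(struct flush_args *fa)
    {
        printf("flushing fd %d\n", fa->fd);
        free(fa);                   /* args owned by whoever flushes */
        return 0;
    }

    static void *flush_async(void *arg)
    {
        do_flush(arg);              /* result intentionally ignored */
        return NULL;
    }

    static int flush(int fd, bool task_exiting)
    {
        struct flush_args *fa = malloc(sizeof(*fa));

        if (!fa)
            return -1;              /* -ENOMEM in the kernel version */
        fa->fd = fd;

        if (task_exiting) {         /* current->flags & PF_EXITING */
            pthread_t t;

            pthread_create(&t, NULL, flush_async, fa);
            pthread_detach(t);
            return 0;
        }
        return do_flush(fa);        /* normal synchronous path */
    }

    int main(void)
    {
        flush(3, false);
        flush(4, true);
        pthread_exit(NULL);         /* let the detached worker finish */
    }

The get_file()/fput() pair in the real code pins the struct file across the deferred work, which is why the file pointer moves into the args structure as well.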
@@ -1313,7 +1351,8 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) return err; if (fc->handle_killpriv_v2 && - setattr_should_drop_suidgid(&init_user_ns, file_inode(file))) { + setattr_should_drop_suidgid(&nop_mnt_idmap, + file_inode(file))) { goto writethrough; } @@ -2184,7 +2223,7 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, return false; } -static int fuse_writepages_fill(struct page *page, +static int fuse_writepages_fill(struct folio *folio, struct writeback_control *wbc, void *_data) { struct fuse_fill_wb_data *data = _data; @@ -2203,7 +2242,7 @@ static int fuse_writepages_fill(struct page *page, goto out_unlock; } - if (wpa && fuse_writepage_need_send(fc, page, ap, data)) { + if (wpa && fuse_writepage_need_send(fc, &folio->page, ap, data)) { fuse_writepages_send(data); data->wpa = NULL; } @@ -2238,7 +2277,7 @@ static int fuse_writepages_fill(struct page *page, data->max_pages = 1; ap = &wpa->ia.ap; - fuse_write_args_fill(&wpa->ia, data->ff, page_offset(page), 0); + fuse_write_args_fill(&wpa->ia, data->ff, folio_pos(folio), 0); wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; wpa->next = NULL; ap->args.in_pages = true; @@ -2246,13 +2285,13 @@ static int fuse_writepages_fill(struct page *page, ap->num_pages = 0; wpa->inode = inode; } - set_page_writeback(page); + folio_start_writeback(folio); - copy_highpage(tmp_page, page); + copy_highpage(tmp_page, &folio->page); ap->pages[ap->num_pages] = tmp_page; ap->descs[ap->num_pages].offset = 0; ap->descs[ap->num_pages].length = PAGE_SIZE; - data->orig_pages[ap->num_pages] = page; + data->orig_pages[ap->num_pages] = &folio->page; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); @@ -2266,13 +2305,13 @@ static int fuse_writepages_fill(struct page *page, spin_lock(&fi->lock); ap->num_pages++; spin_unlock(&fi->lock); - } else if (fuse_writepage_add(wpa, page)) { + } else if (fuse_writepage_add(wpa, &folio->page)) { data->wpa = wpa; } else { - end_page_writeback(page); + folio_end_writeback(folio); } out_unlock: - unlock_page(page); + folio_unlock(folio); return err; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index c673faefdcb9..9b7fc7d3c7f1 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -249,8 +249,9 @@ struct fuse_page_desc { struct fuse_args { uint64_t nodeid; uint32_t opcode; - unsigned short in_numargs; - unsigned short out_numargs; + uint8_t in_numargs; + uint8_t out_numargs; + uint8_t ext_idx; bool force:1; bool noreply:1; bool nocreds:1; @@ -261,6 +262,7 @@ struct fuse_args { bool page_zeroing:1; bool page_replace:1; bool may_block:1; + bool is_ext:1; struct fuse_in_arg in_args[3]; struct fuse_arg out_args[2]; void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error); @@ -781,6 +783,9 @@ struct fuse_conn { /* Initialize security xattrs when creating a new inode */ unsigned int init_security:1; + /* Add supplementary group info when creating a new inode */ + unsigned int create_supp_group:1; + /* Does the filesystem support per inode DAX? 
*/ unsigned int inode_dax:1; @@ -1264,12 +1269,12 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size); int fuse_removexattr(struct inode *inode, const char *name); extern const struct xattr_handler *fuse_xattr_handlers[]; -extern const struct xattr_handler *fuse_acl_xattr_handlers[]; -extern const struct xattr_handler *fuse_no_acl_xattr_handlers[]; struct posix_acl; -struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu); -int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu); +struct posix_acl *fuse_get_acl(struct mnt_idmap *idmap, + struct dentry *dentry, int type); +int fuse_set_acl(struct mnt_idmap *, struct dentry *dentry, struct posix_acl *acl, int type); /* readdir.c */ @@ -1309,7 +1314,7 @@ long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int fuse_fileattr_set(struct user_namespace *mnt_userns, +int fuse_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); /* file.c */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 6b3beda16c1b..d66070af145d 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -311,7 +311,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, fuse_dax_dontcache(inode, attr->flags); } -static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) +static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr, + struct fuse_conn *fc) { inode->i_mode = attr->mode & S_IFMT; inode->i_size = attr->size; @@ -333,6 +334,12 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) new_decode_dev(attr->rdev)); } else BUG(); + /* + * Ensure that we don't cache acls for daemons without FUSE_POSIX_ACL + * so they see the exact same behavior as before. 
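fuse_init_inode() above seeds i_acl and i_default_acl with ACL_DONT_CACHE so the VFS bypasses its ACL cache entirely for legacy daemons. The cache slot is effectively a three-state pointer; the reduction below models it, with sentinel values assumed to match include/linux/posix_acl.h (verify against the tree):

    #include <stdio.h>

    /* Three-state cache slot, as used for inode->i_acl:
     *   ACL_NOT_CACHED  - unknown: fetch, then cache
     *   ACL_DONT_CACHE  - always fetch, never cache (the fuse case)
     *   anything else   - cached result (NULL = cached "no ACL") */
    #define ACL_NOT_CACHED ((void *)-1)
    #define ACL_DONT_CACHE ((void *)-3)

    static const char *slot_state(void *slot)
    {
        if (slot == ACL_NOT_CACHED)
            return "fetch and cache";
        if (slot == ACL_DONT_CACHE)
            return "fetch, do not cache";
        return slot ? "cached ACL" : "cached: no ACL";
    }

    int main(void)
    {
        int dummy;
        void *fresh = ACL_NOT_CACHED, *fuse_legacy = ACL_DONT_CACHE;

        printf("%s\n", slot_state(fresh));
        printf("%s\n", slot_state(fuse_legacy));
        printf("%s\n", slot_state(&dummy));
        printf("%s\n", slot_state(NULL));
        return 0;
    }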
+ */ + if (!fc->posix_acl) + inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE; } static int fuse_inode_eq(struct inode *inode, void *_nodeidp) @@ -372,7 +379,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, if (!inode) return NULL; - fuse_init_inode(inode, attr); + fuse_init_inode(inode, attr, fc); get_fuse_inode(inode)->nodeid = nodeid; inode->i_flags |= S_AUTOMOUNT; goto done; @@ -388,7 +395,7 @@ retry: if (!fc->writeback_cache || !S_ISREG(attr->mode)) inode->i_flags |= S_NOCMTIME; inode->i_generation = generation; - fuse_init_inode(inode, attr); + fuse_init_inode(inode, attr, fc); unlock_new_inode(inode); } else if (fuse_stale_inode(inode, generation, attr)) { /* nodeid was reused, any I/O on the old inode should fail */ @@ -1174,7 +1181,6 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, if ((flags & FUSE_POSIX_ACL)) { fc->default_permissions = 1; fc->posix_acl = 1; - fm->sb->s_xattr = fuse_acl_xattr_handlers; } if (flags & FUSE_CACHE_SYMLINKS) fc->cache_symlinks = 1; @@ -1201,6 +1207,8 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->setxattr_ext = 1; if (flags & FUSE_SECURITY_CTX) fc->init_security = 1; + if (flags & FUSE_CREATE_SUPP_GROUP) + fc->create_supp_group = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1246,7 +1254,7 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA | FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | - FUSE_SECURITY_CTX; + FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; @@ -1420,13 +1428,6 @@ static void fuse_sb_defaults(struct super_block *sb) if (sb->s_user_ns != &init_user_ns) sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); - - /* - * If we are not in the initial user namespace posix - * acls must be translated. 
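FUSE_CREATE_SUPP_GROUP joins the advertised mask in fuse_send_init() above, and process_init_reply() latches the daemon's answer into fc->create_supp_group. The handshake itself is a plain bitwise intersection, sketched below; the bit positions are illustrative only (the authoritative values live in include/uapi/linux/fuse.h, and the security-context and supplementary-group bits are 64-bit FUSE_INIT_EXT flags there):

    #include <stdio.h>
    #include <stdint.h>

    /* Illustrative bit positions, not the real uapi values. */
    #define FUSE_POSIX_ACL_         (UINT64_C(1) << 20)
    #define FUSE_SECURITY_CTX_      (UINT64_C(1) << 32)
    #define FUSE_CREATE_SUPP_GROUP_ (UINT64_C(1) << 34)

    int main(void)
    {
        uint64_t offered = FUSE_POSIX_ACL_ | FUSE_SECURITY_CTX_ |
                           FUSE_CREATE_SUPP_GROUP_;
        uint64_t answered = FUSE_POSIX_ACL_ | FUSE_CREATE_SUPP_GROUP_;
        uint64_t flags = offered & answered;   /* what the conn gets */

        printf("posix_acl=%d init_security=%d create_supp_group=%d\n",
               !!(flags & FUSE_POSIX_ACL_),
               !!(flags & FUSE_SECURITY_CTX_),
               !!(flags & FUSE_CREATE_SUPP_GROUP_));
        return 0;
    }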
- */ - if (sb->s_user_ns != &init_user_ns) - sb->s_xattr = fuse_no_acl_xattr_handlers; } static int fuse_fill_super_submount(struct super_block *sb, diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index fcce94ace2c2..8e01bfdfc430 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -419,6 +419,12 @@ static struct fuse_file *fuse_priv_ioctl_prepare(struct inode *inode) struct fuse_mount *fm = get_fuse_mount(inode); bool isdir = S_ISDIR(inode->i_mode); + if (!fuse_allow_current_process(fm->fc)) + return ERR_PTR(-EACCES); + + if (fuse_is_bad(inode)) + return ERR_PTR(-EIO); + if (!S_ISREG(inode->i_mode) && !isdir) return ERR_PTR(-ENOTTY); @@ -467,7 +473,7 @@ cleanup: return err; } -int fuse_fileattr_set(struct user_namespace *mnt_userns, +int fuse_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 0d3e7177fce0..49c01559580f 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -189,7 +189,7 @@ static int fuse_xattr_get(const struct xattr_handler *handler, } static int fuse_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -203,27 +203,6 @@ static int fuse_xattr_set(const struct xattr_handler *handler, return fuse_setxattr(inode, name, value, size, flags, 0); } -static bool no_xattr_list(struct dentry *dentry) -{ - return false; -} - -static int no_xattr_get(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, void *value, size_t size) -{ - return -EOPNOTSUPP; -} - -static int no_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, - struct dentry *dentry, struct inode *nodee, - const char *name, const void *value, - size_t size, int flags) -{ - return -EOPNOTSUPP; -} - static const struct xattr_handler fuse_xattr_handler = { .prefix = "", .get = fuse_xattr_get, @@ -234,33 +213,3 @@ const struct xattr_handler *fuse_xattr_handlers[] = { &fuse_xattr_handler, NULL }; - -const struct xattr_handler *fuse_acl_xattr_handlers[] = { - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, - &fuse_xattr_handler, - NULL -}; - -static const struct xattr_handler fuse_no_acl_access_xattr_handler = { - .name = XATTR_NAME_POSIX_ACL_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = no_xattr_list, - .get = no_xattr_get, - .set = no_xattr_set, -}; - -static const struct xattr_handler fuse_no_acl_default_xattr_handler = { - .name = XATTR_NAME_POSIX_ACL_DEFAULT, - .flags = ACL_TYPE_ACCESS, - .list = no_xattr_list, - .get = no_xattr_get, - .set = no_xattr_set, -}; - -const struct xattr_handler *fuse_no_acl_xattr_handlers[] = { - &fuse_no_acl_access_xattr_handler, - &fuse_no_acl_default_xattr_handler, - &fuse_xattr_handler, - NULL -}; diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 3dcde4912413..a392aa0f041d 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -109,7 +109,7 @@ out: return error; } -int gfs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int gfs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { struct inode *inode = d_inode(dentry); @@ -135,7 +135,7 @@ int gfs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, mode = inode->i_mode; if (type == ACL_TYPE_ACCESS && acl) { - ret = posix_acl_update_mode(&init_user_ns, inode, &mode, &acl); + ret = 
posix_acl_update_mode(&nop_mnt_idmap, inode, &mode, &acl); if (ret) goto unlock; } diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index b8de8c148f5c..d4deb2b19959 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -13,7 +13,7 @@ extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu); extern int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); -extern int gfs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +extern int gfs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); #endif /* __ACL_DOT_H__ */ diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index e782b4f1d104..a5f4be6b9213 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -37,10 +37,10 @@ #include "aops.h" -void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, - unsigned int from, unsigned int len) +void gfs2_trans_add_databufs(struct gfs2_inode *ip, struct folio *folio, + unsigned int from, unsigned int len) { - struct buffer_head *head = page_buffers(page); + struct buffer_head *head = folio_buffers(folio); unsigned int bsize = head->b_size; struct buffer_head *bh; unsigned int to = from + len; @@ -127,7 +127,6 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w { struct inode *inode = page->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); if (PageChecked(page)) { ClearPageChecked(page); @@ -135,7 +134,7 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w create_empty_buffers(page, inode->i_sb->s_blocksize, BIT(BH_Dirty)|BIT(BH_Uptodate)); } - gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize); + gfs2_trans_add_databufs(ip, page_folio(page), 0, PAGE_SIZE); } return gfs2_write_jdata_page(page, wbc); } @@ -195,67 +194,71 @@ static int gfs2_writepages(struct address_space *mapping, } /** - * gfs2_write_jdata_pagevec - Write back a pagevec's worth of pages + * gfs2_write_jdata_batch - Write back a folio batch's worth of folios * @mapping: The mapping * @wbc: The writeback control - * @pvec: The vector of pages - * @nr_pages: The number of pages to write + * @fbatch: The batch of folios * @done_index: Page index * * Returns: non-zero if loop should terminate, zero otherwise */ -static int gfs2_write_jdata_pagevec(struct address_space *mapping, +static int gfs2_write_jdata_batch(struct address_space *mapping, struct writeback_control *wbc, - struct pagevec *pvec, - int nr_pages, + struct folio_batch *fbatch, pgoff_t *done_index) { struct inode *inode = mapping->host; struct gfs2_sbd *sdp = GFS2_SB(inode); - unsigned nrblocks = nr_pages * (PAGE_SIZE >> inode->i_blkbits); + unsigned nrblocks; int i; int ret; + int nr_pages = 0; + int nr_folios = folio_batch_count(fbatch); + + for (i = 0; i < nr_folios; i++) + nr_pages += folio_nr_pages(fbatch->folios[i]); + nrblocks = nr_pages * (PAGE_SIZE >> inode->i_blkbits); ret = gfs2_trans_begin(sdp, nrblocks, nrblocks); if (ret < 0) return ret; - for(i = 0; i < nr_pages; i++) { - struct page *page = pvec->pages[i]; + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch->folios[i]; - *done_index = page->index; + *done_index = folio->index; - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping != mapping)) { + if (unlikely(folio->mapping != mapping)) { continue_unlock: - unlock_page(page); + folio_unlock(folio); continue; } - if (!PageDirty(page)) { + if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } - if 
(PageWriteback(page)) { + if (folio_test_writeback(folio)) { if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); + folio_wait_writeback(folio); else goto continue_unlock; } - BUG_ON(PageWriteback(page)); - if (!clear_page_dirty_for_io(page)) + BUG_ON(folio_test_writeback(folio)); + if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; trace_wbc_writepage(wbc, inode_to_bdi(inode)); - ret = __gfs2_jdata_writepage(page, wbc); + ret = __gfs2_jdata_writepage(&folio->page, wbc); if (unlikely(ret)) { if (ret == AOP_WRITEPAGE_ACTIVATE) { - unlock_page(page); + folio_unlock(folio); ret = 0; } else { @@ -268,7 +271,8 @@ continue_unlock: * not be suitable for data integrity * writeout). */ - *done_index = page->index + 1; + *done_index = folio->index + + folio_nr_pages(folio); ret = 1; break; } @@ -305,8 +309,8 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, { int ret = 0; int done = 0; - struct pagevec pvec; - int nr_pages; + struct folio_batch fbatch; + int nr_folios; pgoff_t writeback_index; pgoff_t index; pgoff_t end; @@ -315,7 +319,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, int range_whole = 0; xa_mark_t tag; - pagevec_init(&pvec); + folio_batch_init(&fbatch); if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; @@ -341,17 +345,18 @@ retry: tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && (index <= end)) { - nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag); - if (nr_pages == 0) + nr_folios = filemap_get_folios_tag(mapping, &index, end, + tag, &fbatch); + if (nr_folios == 0) break; - ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, &done_index); + ret = gfs2_write_jdata_batch(mapping, wbc, &fbatch, + &done_index); if (ret) done = 1; if (ret > 0) ret = 0; - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } diff --git a/fs/gfs2/aops.h b/fs/gfs2/aops.h index ff9877a68780..09db1914425e 100644 --- a/fs/gfs2/aops.h +++ b/fs/gfs2/aops.h @@ -9,7 +9,7 @@ #include "incore.h" extern void adjust_fs_space(struct inode *inode); -extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, - unsigned int from, unsigned int len); +extern void gfs2_trans_add_databufs(struct gfs2_inode *ip, struct folio *folio, + unsigned int from, unsigned int len); #endif /* __AOPS_DOT_H__ */ diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index e7537fd305dd..eedf6926c652 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -956,26 +956,40 @@ hole_found: goto out; } -static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos, - unsigned len) +static struct folio * +gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len) { + struct inode *inode = iter->inode; unsigned int blockmask = i_blocksize(inode) - 1; struct gfs2_sbd *sdp = GFS2_SB(inode); unsigned int blocks; + struct folio *folio; + int status; blocks = ((pos & blockmask) + len + blockmask) >> inode->i_blkbits; - return gfs2_trans_begin(sdp, RES_DINODE + blocks, 0); + status = gfs2_trans_begin(sdp, RES_DINODE + blocks, 0); + if (status) + return ERR_PTR(status); + + folio = iomap_get_folio(iter, pos); + if (IS_ERR(folio)) + gfs2_trans_end(sdp); + return folio; } -static void gfs2_iomap_page_done(struct inode *inode, loff_t pos, - unsigned copied, struct page *page) +static void gfs2_iomap_put_folio(struct inode *inode, loff_t pos, + unsigned copied, struct folio *folio) { struct gfs2_trans *tr = current->journal_info; 
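gfs2_write_jdata_batch() above replaces the pagevec walk with a folio batch, and because one folio can span several pages, the journal reservation is now computed from the summed per-folio page counts rather than the batch entry count. The arithmetic, worked for 1 KiB blocks on 4 KiB pages with a hypothetical batch:

    #include <stdio.h>

    #define PAGE_SIZE_ 4096u

    int main(void)
    {
        unsigned folio_pages[] = { 1, 4, 2 };   /* hypothetical batch */
        unsigned blkbits = 10;                  /* 1 KiB blocks */
        unsigned nr_pages = 0;

        for (unsigned i = 0;
             i < sizeof(folio_pages) / sizeof(folio_pages[0]); i++)
            nr_pages += folio_pages[i];

        /* nrblocks = nr_pages * blocks-per-page, as in the hunk above */
        unsigned nrblocks = nr_pages * (PAGE_SIZE_ >> blkbits);

        printf("pages=%u blocks=%u\n", nr_pages, nrblocks); /* 7, 28 */
        return 0;
    }

The same multi-page property is why *done_index now advances by folio_nr_pages(folio) rather than by one on early termination.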
struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - if (page && !gfs2_is_stuffed(ip)) - gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied); + if (!gfs2_is_stuffed(ip)) + gfs2_trans_add_databufs(ip, folio, offset_in_folio(folio, pos), + copied); + + folio_unlock(folio); + folio_put(folio); if (tr->tr_num_buf_new) __mark_inode_dirty(inode, I_DIRTY_DATASYNC); @@ -983,9 +997,9 @@ static void gfs2_iomap_page_done(struct inode *inode, loff_t pos, gfs2_trans_end(sdp); } -static const struct iomap_page_ops gfs2_iomap_page_ops = { - .page_prepare = gfs2_iomap_page_prepare, - .page_done = gfs2_iomap_page_done, +static const struct iomap_folio_ops gfs2_iomap_folio_ops = { + .get_folio = gfs2_iomap_get_folio, + .put_folio = gfs2_iomap_put_folio, }; static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, @@ -1061,7 +1075,7 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, } if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip)) - iomap->page_ops = &gfs2_iomap_page_ops; + iomap->folio_ops = &gfs2_iomap_folio_ops; return 0; out_trans_end: @@ -1277,7 +1291,7 @@ int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock, /* * NOTE: Never call gfs2_block_zero_range with an open transaction because it * uses iomap write to perform its actions, which begin their own transactions - * (iomap_begin, page_prepare, etc.) + * (iomap_begin, get_folio, etc.) */ static int gfs2_block_zero_range(struct inode *inode, loff_t from, unsigned int length) diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 2e215e8c3c88..6fe9ca253b70 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -83,26 +83,8 @@ static int gfs2_dhash(const struct dentry *dentry, struct qstr *str) return 0; } -static int gfs2_dentry_delete(const struct dentry *dentry) -{ - struct gfs2_inode *ginode; - - if (d_really_is_negative(dentry)) - return 0; - - ginode = GFS2_I(d_inode(dentry)); - if (!gfs2_holder_initialized(&ginode->i_iopen_gh)) - return 0; - - if (test_bit(GLF_DEMOTE, &ginode->i_iopen_gh.gh_gl->gl_flags)) - return 1; - - return 0; -} - const struct dentry_operations gfs2_dops = { .d_revalidate = gfs2_drevalidate, .d_hash = gfs2_dhash, - .d_delete = gfs2_dentry_delete, }; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index eea5be4fbf0e..300844f50dcd 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -15,6 +15,7 @@ #include <linux/mm.h> #include <linux/mount.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/gfs2_ondisk.h> #include <linux/falloc.h> #include <linux/swap.h> @@ -235,7 +236,7 @@ static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask) goto out; if (!IS_IMMUTABLE(inode)) { - error = gfs2_permission(&init_user_ns, inode, MAY_WRITE); + error = gfs2_permission(&nop_mnt_idmap, inode, MAY_WRITE); if (error) goto out; } @@ -273,7 +274,7 @@ out: return error; } -int gfs2_fileattr_set(struct user_namespace *mnt_userns, +int gfs2_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 524f3c96b9a4..5adc7d85dbf3 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -67,7 +67,6 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state, static struct dentry *gfs2_root; static struct workqueue_struct *glock_workqueue; -struct workqueue_struct *gfs2_delete_workqueue; static LIST_HEAD(lru_list); static atomic_t lru_count = ATOMIC_INIT(0); static DEFINE_SPINLOCK(lru_lock); @@ -274,9 +273,8 @@ static void 
__gfs2_glock_put(struct gfs2_glock *gl) struct address_space *mapping = gfs2_glock2aspace(gl); lockref_mark_dead(&gl->gl_lockref); - - gfs2_glock_remove_from_lru(gl); spin_unlock(&gl->gl_lockref.lock); + gfs2_glock_remove_from_lru(gl); GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); if (mapping) { truncate_inode_pages_final(mapping); @@ -883,6 +881,7 @@ void glock_set_object(struct gfs2_glock *gl, void *object) /** * glock_clear_object - clear the gl_object field of a glock * @gl: the glock + * @object: object the glock currently points at */ void glock_clear_object(struct gfs2_glock *gl, void *object) { @@ -892,8 +891,7 @@ void glock_clear_object(struct gfs2_glock *gl, void *object) prev_object = gl->gl_object; gl->gl_object = NULL; spin_unlock(&gl->gl_lockref.lock); - if (gfs2_assert_warn(gl->gl_name.ln_sbd, - prev_object == object || prev_object == NULL)) { + if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == object)) { pr_warn("glock=%u/%llx\n", gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number); @@ -977,6 +975,26 @@ static bool gfs2_try_evict(struct gfs2_glock *gl) return evicted; } +bool gfs2_queue_try_to_evict(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + + if (test_and_set_bit(GLF_TRY_TO_EVICT, &gl->gl_flags)) + return false; + return queue_delayed_work(sdp->sd_delete_wq, + &gl->gl_delete, 0); +} + +static bool gfs2_queue_verify_evict(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + + if (test_and_set_bit(GLF_VERIFY_EVICT, &gl->gl_flags)) + return false; + return queue_delayed_work(sdp->sd_delete_wq, + &gl->gl_delete, 5 * HZ); +} + static void delete_work_func(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); @@ -985,11 +1003,7 @@ static void delete_work_func(struct work_struct *work) struct inode *inode; u64 no_addr = gl->gl_name.ln_number; - spin_lock(&gl->gl_lockref.lock); - clear_bit(GLF_PENDING_DELETE, &gl->gl_flags); - spin_unlock(&gl->gl_lockref.lock); - - if (test_bit(GLF_DEMOTE, &gl->gl_flags)) { + if (test_and_clear_bit(GLF_TRY_TO_EVICT, &gl->gl_flags)) { /* * If we can evict the inode, give the remote node trying to * delete the inode some time before verifying that the delete @@ -1008,22 +1022,28 @@ static void delete_work_func(struct work_struct *work) * step entirely. 
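The delete-work rework above splits the old GLF_PENDING_DELETE state into two phases, GLF_TRY_TO_EVICT and GLF_VERIFY_EVICT, each gated by test_and_set_bit() so a glock is queued at most once per phase no matter how many remote callbacks arrive. A single-threaded model of that idempotent gating (the kernel uses atomic bit ops and a real workqueue; plain flags suffice for this sketch):

    #include <stdio.h>
    #include <stdbool.h>

    enum { TRY_TO_EVICT, VERIFY_EVICT };

    static unsigned long gl_flags;

    static bool test_and_set(int bit)
    {
        bool was_set = gl_flags & (1UL << bit);

        gl_flags |= 1UL << bit;
        return was_set;
    }

    static bool queue_try_to_evict(void)
    {
        if (test_and_set(TRY_TO_EVICT))
            return false;               /* already queued: no-op */
        printf("queue delete work, delay 0\n");
        return true;
    }

    int main(void)
    {
        printf("first:  %d\n", queue_try_to_evict()); /* 1: queued */
        printf("second: %d\n", queue_try_to_evict()); /* 0: suppressed */
        return 0;
    }

delete_work_func() then consumes whichever phase bit is set with test_and_clear_bit(), so the verify phase re-arms itself with a 5*HZ delay only when the first eviction attempt could not settle the question.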
*/ if (gfs2_try_evict(gl)) { - if (gfs2_queue_delete_work(gl, 5 * HZ)) + if (test_bit(SDF_DEACTIVATING, &sdp->sd_flags)) + goto out; + if (gfs2_queue_verify_evict(gl)) return; } goto out; } - inode = gfs2_lookup_by_inum(sdp, no_addr, gl->gl_no_formal_ino, - GFS2_BLKST_UNLINKED); - if (IS_ERR(inode)) { - if (PTR_ERR(inode) == -EAGAIN && - (gfs2_queue_delete_work(gl, 5 * HZ))) + if (test_and_clear_bit(GLF_VERIFY_EVICT, &gl->gl_flags)) { + inode = gfs2_lookup_by_inum(sdp, no_addr, gl->gl_no_formal_ino, + GFS2_BLKST_UNLINKED); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -EAGAIN && + !test_bit(SDF_DEACTIVATING, &sdp->sd_flags) && + gfs2_queue_verify_evict(gl)) return; - } else { - d_prune_aliases(inode); - iput(inode); + } else { + d_prune_aliases(inode); + iput(inode); + } } + out: gfs2_glock_put(gl); } @@ -1985,26 +2005,26 @@ add_back_to_lru: static long gfs2_scan_glock_lru(int nr) { - struct gfs2_glock *gl; - LIST_HEAD(skipped); + struct gfs2_glock *gl, *next; LIST_HEAD(dispose); long freed = 0; spin_lock(&lru_lock); - while ((nr-- >= 0) && !list_empty(&lru_list)) { - gl = list_first_entry(&lru_list, struct gfs2_glock, gl_lru); - + list_for_each_entry_safe(gl, next, &lru_list, gl_lru) { + if (nr-- <= 0) + break; /* Test for being demotable */ if (!test_bit(GLF_LOCK, &gl->gl_flags)) { - list_move(&gl->gl_lru, &dispose); - atomic_dec(&lru_count); - freed++; - continue; + if (!spin_trylock(&gl->gl_lockref.lock)) + continue; + if (!gl->gl_lockref.count) { + list_move(&gl->gl_lru, &dispose); + atomic_dec(&lru_count); + freed++; + } + spin_unlock(&gl->gl_lockref.lock); } - - list_move(&gl->gl_lru, &skipped); } - list_splice(&skipped, &lru_list); if (!list_empty(&dispose)) gfs2_dispose_glock_lru(&dispose); spin_unlock(&lru_lock); @@ -2063,37 +2083,21 @@ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) rhashtable_walk_exit(&iter); } -bool gfs2_queue_delete_work(struct gfs2_glock *gl, unsigned long delay) -{ - bool queued; - - spin_lock(&gl->gl_lockref.lock); - queued = queue_delayed_work(gfs2_delete_workqueue, - &gl->gl_delete, delay); - if (queued) - set_bit(GLF_PENDING_DELETE, &gl->gl_flags); - spin_unlock(&gl->gl_lockref.lock); - return queued; -} - void gfs2_cancel_delete_work(struct gfs2_glock *gl) { - if (cancel_delayed_work(&gl->gl_delete)) { - clear_bit(GLF_PENDING_DELETE, &gl->gl_flags); + clear_bit(GLF_TRY_TO_EVICT, &gl->gl_flags); + clear_bit(GLF_VERIFY_EVICT, &gl->gl_flags); + if (cancel_delayed_work(&gl->gl_delete)) gfs2_glock_put(gl); - } -} - -bool gfs2_delete_work_queued(const struct gfs2_glock *gl) -{ - return test_bit(GLF_PENDING_DELETE, &gl->gl_flags); } static void flush_delete_work(struct gfs2_glock *gl) { if (gl->gl_name.ln_type == LM_TYPE_IOPEN) { + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + if (cancel_delayed_work(&gl->gl_delete)) { - queue_delayed_work(gfs2_delete_workqueue, + queue_delayed_work(sdp->sd_delete_wq, &gl->gl_delete, 0); } } @@ -2102,7 +2106,7 @@ static void flush_delete_work(struct gfs2_glock *gl) void gfs2_flush_delete_work(struct gfs2_sbd *sdp) { glock_hash_walk(flush_delete_work, sdp); - flush_workqueue(gfs2_delete_workqueue); + flush_workqueue(sdp->sd_delete_wq); } /** @@ -2308,14 +2312,16 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) *p++ = 'o'; if (test_bit(GLF_BLOCKING, gflags)) *p++ = 'b'; - if (test_bit(GLF_PENDING_DELETE, gflags)) - *p++ = 'P'; if (test_bit(GLF_FREEING, gflags)) *p++ = 'x'; if (test_bit(GLF_INSTANTIATE_NEEDED, gflags)) *p++ = 'n'; if (test_bit(GLF_INSTANTIATE_IN_PROG, 
gflags)) *p++ = 'N'; + if (test_bit(GLF_TRY_TO_EVICT, gflags)) + *p++ = 'e'; + if (test_bit(GLF_VERIFY_EVICT, gflags)) + *p++ = 'E'; *p = 0; return buf; } @@ -2465,18 +2471,9 @@ int __init gfs2_glock_init(void) rhashtable_destroy(&gl_hash_table); return -ENOMEM; } - gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", - WQ_MEM_RECLAIM | WQ_FREEZABLE, - 0); - if (!gfs2_delete_workqueue) { - destroy_workqueue(glock_workqueue); - rhashtable_destroy(&gl_hash_table); - return -ENOMEM; - } ret = register_shrinker(&glock_shrinker, "gfs2-glock"); if (ret) { - destroy_workqueue(gfs2_delete_workqueue); destroy_workqueue(glock_workqueue); rhashtable_destroy(&gl_hash_table); return ret; @@ -2493,7 +2490,6 @@ void gfs2_glock_exit(void) unregister_shrinker(&glock_shrinker); rhashtable_destroy(&gl_hash_table); destroy_workqueue(glock_workqueue); - destroy_workqueue(gfs2_delete_workqueue); } static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi, loff_t n) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index f37ac087e2c1..1f1ba92c15a8 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -144,7 +144,6 @@ struct gfs2_glock_aspace { struct address_space mapping; }; -extern struct workqueue_struct *gfs2_delete_workqueue; static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) { struct gfs2_holder *gh; @@ -268,9 +267,8 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl, extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state); extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret); -extern bool gfs2_queue_delete_work(struct gfs2_glock *gl, unsigned long delay); +extern bool gfs2_queue_try_to_evict(struct gfs2_glock *gl); extern void gfs2_cancel_delete_work(struct gfs2_glock *gl); -extern bool gfs2_delete_work_queued(const struct gfs2_glock *gl); extern void gfs2_flush_delete_work(struct gfs2_sbd *sdp); extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); extern void gfs2_gl_dq_holders(struct gfs2_sbd *sdp); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index d78b61ecc1cd..4d99cc77a29b 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -39,7 +39,7 @@ static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page " "state 0x%lx\n", bh, (unsigned long long)bh->b_blocknr, bh->b_state, - bh->b_page->mapping, bh->b_page->flags); + bh->b_folio->mapping, bh->b_folio->flags); fs_err(sdp, "AIL glock %u:%llu mapping %p\n", gl->gl_name.ln_type, gl->gl_name.ln_number, gfs2_glock2aspace(gl)); @@ -193,7 +193,7 @@ static int rgrp_go_sync(struct gfs2_glock *gl) struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); int error; - if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) + if (!rgd || !test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) return 0; GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); @@ -222,9 +222,12 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) struct address_space *mapping = &sdp->sd_aspace; struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); const unsigned bsize = sdp->sd_sb.sb_bsize; - loff_t start = (rgd->rd_addr * bsize) & PAGE_MASK; - loff_t end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1; + loff_t start, end; + if (!rgd) + return; + start = (rgd->rd_addr * bsize) & PAGE_MASK; + end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1; gfs2_rgrp_brelse(rgd); WARN_ON_ONCE(!(flags & DIO_METADATA)); truncate_inode_pages_range(mapping, start, end); @@ -645,23 +648,18 @@ static void iopen_go_callback(struct gfs2_glock *gl, bool remote) 
struct gfs2_inode *ip = gl->gl_object; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - if (!remote || sb_rdonly(sdp->sd_vfs)) + if (!remote || sb_rdonly(sdp->sd_vfs) || + test_bit(SDF_DEACTIVATING, &sdp->sd_flags)) return; if (gl->gl_demote_state == LM_ST_UNLOCKED && gl->gl_state == LM_ST_SHARED && ip) { gl->gl_lockref.count++; - if (!queue_delayed_work(gfs2_delete_workqueue, - &gl->gl_delete, 0)) + if (!gfs2_queue_try_to_evict(gl)) gl->gl_lockref.count--; } } -static int iopen_go_demote_ok(const struct gfs2_glock *gl) -{ - return !gfs2_delete_work_queued(gl); -} - /** * inode_go_free - wake up anyone waiting for dlm's unlock ast to free it * @gl: glock being freed @@ -767,7 +765,6 @@ const struct gfs2_glock_operations gfs2_iopen_glops = { .go_type = LM_TYPE_IOPEN, .go_callback = iopen_go_callback, .go_dump = inode_go_dump, - .go_demote_ok = iopen_go_demote_ok, .go_flags = GLOF_LRU | GLOF_NONDISK, .go_subclass = 1, }; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index c26765080f28..79485329118b 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -329,8 +329,9 @@ enum { GLF_LRU = 13, GLF_OBJECT = 14, /* Used only for tracing */ GLF_BLOCKING = 15, - GLF_PENDING_DELETE = 17, - GLF_FREEING = 18, /* Wait for glock to be freed */ + GLF_FREEING = 16, /* Wait for glock to be freed */ + GLF_TRY_TO_EVICT = 17, /* iopen glocks only */ + GLF_VERIFY_EVICT = 18, /* iopen glocks only */ }; struct gfs2_glock { @@ -605,6 +606,8 @@ enum { SDF_REMOTE_WITHDRAW = 13, /* Performing remote recovery */ SDF_WITHDRAW_RECOVERY = 14, /* Wait for journal recovery when we are withdrawing */ + SDF_DEACTIVATING = 15, + SDF_EVICTING = 16, }; enum gfs2_freeze_state { @@ -771,6 +774,10 @@ struct gfs2_sbd { struct completion sd_journal_ready; + /* Workqueue stuff */ + + struct workqueue_struct *sd_delete_wq; + /* Daemon stuff */ struct task_struct *sd_logd_process; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 614db3055c02..1291b5ee3584 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -225,6 +225,10 @@ fail: gfs2_glock_dq_uninit(&ip->i_iopen_gh); if (gfs2_holder_initialized(&i_gh)) gfs2_glock_dq_uninit(&i_gh); + if (ip->i_gl) { + gfs2_glock_put(ip->i_gl); + ip->i_gl = NULL; + } iget_failed(inode); return ERR_PTR(error); } @@ -320,7 +324,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, } if (!is_root) { - error = gfs2_permission(&init_user_ns, dir, MAY_EXEC); + error = gfs2_permission(&nop_mnt_idmap, dir, MAY_EXEC); if (error) goto out; } @@ -350,7 +354,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name, { int error; - error = gfs2_permission(&init_user_ns, &dip->i_inode, + error = gfs2_permission(&nop_mnt_idmap, &dip->i_inode, MAY_WRITE | MAY_EXEC); if (error) return error; @@ -816,6 +820,10 @@ fail_gunlock3: fail_gunlock2: gfs2_glock_put(io_gl); fail_free_inode: + if (ip->i_gl) { + gfs2_glock_put(ip->i_gl); + ip->i_gl = NULL; + } gfs2_rs_deltree(&ip->i_res); gfs2_qa_put(ip); fail_free_acls: @@ -843,7 +851,7 @@ fail: /** * gfs2_create - Create a file - * @mnt_userns: User namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: The directory in which to create the file * @dentry: The dentry of the new file * @mode: The mode of the new file @@ -852,7 +860,7 @@ fail: * Returns: errno */ -static int gfs2_create(struct user_namespace *mnt_userns, struct inode *dir, +static int gfs2_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return 
gfs2_create_inode(dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, excl); @@ -960,7 +968,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (inode->i_nlink == 0) goto out_gunlock; - error = gfs2_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC); + error = gfs2_permission(&nop_mnt_idmap, dir, MAY_WRITE | MAY_EXEC); if (error) goto out_gunlock; @@ -1078,7 +1086,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, if (IS_APPEND(&dip->i_inode)) return -EPERM; - error = gfs2_permission(&init_user_ns, &dip->i_inode, + error = gfs2_permission(&nop_mnt_idmap, &dip->i_inode, MAY_WRITE | MAY_EXEC); if (error) return error; @@ -1207,7 +1215,7 @@ out_inodes: /** * gfs2_symlink - Create a symlink - * @mnt_userns: User namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: The directory to create the symlink in * @dentry: The dentry to put the symlink in * @symname: The thing which the link points to @@ -1215,7 +1223,7 @@ out_inodes: * Returns: errno */ -static int gfs2_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int gfs2_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { unsigned int size; @@ -1229,7 +1237,7 @@ static int gfs2_symlink(struct user_namespace *mnt_userns, struct inode *dir, /** * gfs2_mkdir - Make a directory - * @mnt_userns: User namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: The parent directory of the new one * @dentry: The dentry of the new directory * @mode: The mode of the new directory @@ -1237,7 +1245,7 @@ static int gfs2_symlink(struct user_namespace *mnt_userns, struct inode *dir, * Returns: errno */ -static int gfs2_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int gfs2_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { unsigned dsize = gfs2_max_stuffed_size(GFS2_I(dir)); @@ -1246,7 +1254,7 @@ static int gfs2_mkdir(struct user_namespace *mnt_userns, struct inode *dir, /** * gfs2_mknod - Make a special file - * @mnt_userns: User namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: The directory in which the special file will reside * @dentry: The dentry of the special file * @mode: The mode of the special file @@ -1254,7 +1262,7 @@ static int gfs2_mkdir(struct user_namespace *mnt_userns, struct inode *dir, * */ -static int gfs2_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int gfs2_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { return gfs2_create_inode(dir, dentry, NULL, mode, dev, NULL, 0, 0); @@ -1504,7 +1512,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, } } } else { - error = gfs2_permission(&init_user_ns, ndir, + error = gfs2_permission(&nop_mnt_idmap, ndir, MAY_WRITE | MAY_EXEC); if (error) goto out_gunlock; @@ -1541,7 +1549,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, /* Check out the dir to be renamed */ if (dir_rename) { - error = gfs2_permission(&init_user_ns, d_inode(odentry), + error = gfs2_permission(&nop_mnt_idmap, d_inode(odentry), MAY_WRITE); if (error) goto out_gunlock; @@ -1705,13 +1713,13 @@ static int gfs2_exchange(struct inode *odir, struct dentry *odentry, goto out_gunlock; if (S_ISDIR(old_mode)) { - error = gfs2_permission(&init_user_ns, odentry->d_inode, + error = 
gfs2_permission(&nop_mnt_idmap, odentry->d_inode, MAY_WRITE); if (error) goto out_gunlock; } if (S_ISDIR(new_mode)) { - error = gfs2_permission(&init_user_ns, ndentry->d_inode, + error = gfs2_permission(&nop_mnt_idmap, ndentry->d_inode, MAY_WRITE); if (error) goto out_gunlock; @@ -1766,7 +1774,7 @@ out: return error; } -static int gfs2_rename2(struct user_namespace *mnt_userns, struct inode *odir, +static int gfs2_rename2(struct mnt_idmap *idmap, struct inode *odir, struct dentry *odentry, struct inode *ndir, struct dentry *ndentry, unsigned int flags) { @@ -1841,7 +1849,7 @@ out: /** * gfs2_permission - * @mnt_userns: User namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: The inode * @mask: The mask to be tested * @@ -1852,7 +1860,7 @@ out: * Returns: errno */ -int gfs2_permission(struct user_namespace *mnt_userns, struct inode *inode, +int gfs2_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct gfs2_inode *ip; @@ -1872,7 +1880,7 @@ int gfs2_permission(struct user_namespace *mnt_userns, struct inode *inode, if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) error = -EPERM; else - error = generic_permission(&init_user_ns, inode, mask); + error = generic_permission(&nop_mnt_idmap, inode, mask); if (gfs2_holder_initialized(&i_gh)) gfs2_glock_dq_uninit(&i_gh); @@ -1881,7 +1889,7 @@ int gfs2_permission(struct user_namespace *mnt_userns, struct inode *inode, static int __gfs2_setattr_simple(struct inode *inode, struct iattr *attr) { - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } @@ -1966,7 +1974,7 @@ out: /** * gfs2_setattr - Change attributes on an inode - * @mnt_userns: User namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: The dentry which is changing * @attr: The structure describing the change * @@ -1976,7 +1984,7 @@ out: * Returns: errno */ -static int gfs2_setattr(struct user_namespace *mnt_userns, +static int gfs2_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -1992,11 +2000,11 @@ static int gfs2_setattr(struct user_namespace *mnt_userns, if (error) goto out; - error = may_setattr(&init_user_ns, inode, attr->ia_valid); + error = may_setattr(&nop_mnt_idmap, inode, attr->ia_valid); if (error) goto error; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) goto error; @@ -2007,7 +2015,7 @@ static int gfs2_setattr(struct user_namespace *mnt_userns, else { error = gfs2_setattr_simple(inode, attr); if (!error && attr->ia_valid & ATTR_MODE) - error = posix_acl_chmod(&init_user_ns, dentry, + error = posix_acl_chmod(&nop_mnt_idmap, dentry, inode->i_mode); } @@ -2022,7 +2030,7 @@ out: /** * gfs2_getattr - Read out an inode's attributes - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @path: Object to query * @stat: The inode's stats * @request_mask: Mask of STATX_xxx flags indicating the caller's interests @@ -2037,7 +2045,7 @@ out: * Returns: errno */ -static int gfs2_getattr(struct user_namespace *mnt_userns, +static int gfs2_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { @@ -2066,7 +2074,7 @@ static int gfs2_getattr(struct user_namespace *mnt_userns, STATX_ATTR_IMMUTABLE | 
STATX_ATTR_NODUMP); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); if (gfs2_holder_initialized(&gh)) gfs2_glock_dq_uninit(&gh); diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 0264d514dda7..c8c5814e7295 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -99,7 +99,7 @@ extern int gfs2_inode_refresh(struct gfs2_inode *ip); extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, int is_root); -extern int gfs2_permission(struct user_namespace *mnt_userns, +extern int gfs2_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); @@ -111,7 +111,7 @@ extern const struct file_operations gfs2_file_fops_nolock; extern const struct file_operations gfs2_dir_fops_nolock; extern int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa); -extern int gfs2_fileattr_set(struct user_namespace *mnt_userns, +extern int gfs2_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); extern void gfs2_set_inode_flags(struct inode *inode); diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 723639376ae2..d750d1128bed 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -80,6 +80,15 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd) brelse(bd->bd_bh); } +static int __gfs2_writepage(struct folio *folio, struct writeback_control *wbc, + void *data) +{ + struct address_space *mapping = data; + int ret = mapping->a_ops->writepage(&folio->page, wbc); + mapping_set_error(mapping, ret); + return ret; +} + /** * gfs2_ail1_start_one - Start I/O on a transaction * @sdp: The superblock @@ -127,11 +136,11 @@ __acquires(&sdp->sd_ail_lock) continue; gl = bd->bd_gl; list_move(&bd->bd_ail_st_list, &tr->tr_ail1_list); - mapping = bh->b_page->mapping; + mapping = bh->b_folio->mapping; if (!mapping) continue; spin_unlock(&sdp->sd_ail_lock); - ret = filemap_fdatawrite_wbc(mapping, wbc); + ret = write_cache_pages(mapping, wbc, __gfs2_writepage, mapping); if (need_resched()) { blk_finish_plug(plug); cond_resched(); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 3c41b864ee5b..924361fa510b 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -334,7 +334,7 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) void gfs2_remove_from_journal(struct buffer_head *bh, int meta) { - struct address_space *mapping = bh->b_page->mapping; + struct address_space *mapping = bh->b_folio->mapping; struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); struct gfs2_bufdata *bd = bh->b_private; struct gfs2_trans *tr = current->journal_info; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index c0cf1d2d0ef5..6de901c3b89b 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1197,9 +1197,15 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s", sdp->sd_table_name); + sdp->sd_delete_wq = alloc_workqueue("gfs2-delete/%s", + WQ_MEM_RECLAIM | WQ_FREEZABLE, 0, sdp->sd_fsname); + error = -ENOMEM; + if (!sdp->sd_delete_wq) + goto fail_free; + error = gfs2_sys_fs_add(sdp); if (error) - goto fail_free; + goto fail_delete_wq; gfs2_create_debugfs_file(sdp); @@ -1309,6 +1315,8 @@ fail_lm: fail_debug: gfs2_delete_debugfs_file(sdp); gfs2_sys_fs_del(sdp); +fail_delete_wq: + destroy_workqueue(sdp->sd_delete_wq); fail_free: free_sbd(sdp); sb->s_fs_info = NULL; @@ -1720,6 +1728,55 @@ static int 
gfs2_meta_init_fs_context(struct fs_context *fc) return 0; } +/** + * gfs2_evict_inodes - evict inodes cooperatively + * @sb: the superblock + * + * When evicting an inode with a zero link count, we are trying to upgrade the + * inode's iopen glock from SH to EX mode in order to determine if we can + * delete the inode. The other nodes are supposed to evict the inode from + * their caches if they can, and to poke the inode's inode glock if they cannot + * do so. Either behavior allows gfs2_upgrade_iopen_glock() to proceed + * quickly, but if the other nodes are not cooperating, the lock upgrading + * attempt will time out. Since inodes are evicted sequentially, this can add + * up quickly. + * + * Function evict_inodes() tries to keep the s_inode_list_lock list locked over + * a long time, which prevents other inodes from being evicted concurrently. + * This precludes the cooperative behavior we are looking for. This special + * version of evict_inodes() avoids that. + * + * Modeled after drop_pagecache_sb(). + */ +static void gfs2_evict_inodes(struct super_block *sb) +{ + struct inode *inode, *toput_inode = NULL; + struct gfs2_sbd *sdp = sb->s_fs_info; + + set_bit(SDF_EVICTING, &sdp->sd_flags); + + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) && + !need_resched()) { + spin_unlock(&inode->i_lock); + continue; + } + atomic_inc(&inode->i_count); + spin_unlock(&inode->i_lock); + spin_unlock(&sb->s_inode_list_lock); + + iput(toput_inode); + toput_inode = inode; + + cond_resched(); + spin_lock(&sb->s_inode_list_lock); + } + spin_unlock(&sb->s_inode_list_lock); + iput(toput_inode); +} + static void gfs2_kill_sb(struct super_block *sb) { struct gfs2_sbd *sdp = sb->s_fs_info; @@ -1735,6 +1792,18 @@ static void gfs2_kill_sb(struct super_block *sb) sdp->sd_root_dir = NULL; sdp->sd_master_dir = NULL; shrink_dcache_sb(sb); + + gfs2_evict_inodes(sb); + + /* + * Flush and then drain the delete workqueue here (via + * destroy_workqueue()) to ensure that any delete work that + * may be running will also see the SDF_DEACTIVATING flag. 
+ */ + set_bit(SDF_DEACTIVATING, &sdp->sd_flags); + gfs2_flush_delete_work(sdp); + destroy_workqueue(sdp->sd_delete_wq); + kill_block_super(sb); } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index f602fb844951..3b9b76e980ad 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1879,7 +1879,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip */ ip = gl->gl_object; - if (ip || !gfs2_queue_delete_work(gl, 0)) + if (ip || !gfs2_queue_try_to_evict(gl)) gfs2_glock_put(gl); else found++; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 999cc146d708..a83fa62106f0 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -138,8 +138,10 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) return -EIO; error = gfs2_find_jhead(sdp->sd_jdesc, &head, false); - if (error || gfs2_withdrawn(sdp)) + if (error) { + gfs2_consist(sdp); return error; + } if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { gfs2_consist(sdp); @@ -151,7 +153,9 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) gfs2_log_pointers_init(sdp, head.lh_blkno); error = gfs2_quota_init(sdp); - if (!error && !gfs2_withdrawn(sdp)) + if (!error && gfs2_withdrawn(sdp)) + error = -EIO; + if (!error) set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); return error; } @@ -529,7 +533,9 @@ void gfs2_make_fs_ro(struct gfs2_sbd *sdp) { int log_write_allowed = test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); - gfs2_flush_delete_work(sdp); + if (!test_bit(SDF_DEACTIVATING, &sdp->sd_flags)) + gfs2_flush_delete_work(sdp); + if (!log_write_allowed && current == sdp->sd_quotad_process) fs_warn(sdp, "The quotad daemon is withdrawing.\n"); else if (sdp->sd_quotad_process) @@ -933,6 +939,7 @@ static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) static int gfs2_drop_inode(struct inode *inode) { struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); if (inode->i_nlink && gfs2_holder_initialized(&ip->i_iopen_gh)) { @@ -952,11 +959,17 @@ static int gfs2_drop_inode(struct inode *inode) struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; gfs2_glock_hold(gl); - if (!gfs2_queue_delete_work(gl, 0)) + if (!gfs2_queue_try_to_evict(gl)) gfs2_glock_queue_put(gl); return 0; } + /* + * No longer cache inodes when trying to evict them all. + */ + if (test_bit(SDF_EVICTING, &sdp->sd_flags)) + return 1; + return generic_drop_inode(inode); } @@ -1175,15 +1188,23 @@ static bool gfs2_upgrade_iopen_glock(struct inode *inode) gfs2_glock_dq_wait(gh); /* - * If there are no other lock holders, we'll get the lock immediately. + * If there are no other lock holders, we will immediately get + * exclusive access to the iopen glock here. + * * Otherwise, the other nodes holding the lock will be notified about - * our locking request. If they don't have the inode open, they'll - * evict the cached inode and release the lock. Otherwise, if they - * poke the inode glock, we'll take this as an indication that they - * still need the iopen glock and that they'll take care of deleting - * the inode when they're done. As a last resort, if another node - * keeps holding the iopen glock without showing any activity on the - * inode glock, we'll eventually time out. + * our locking request. If they do not have the inode open, they are + * expected to evict the cached inode and release the lock, allowing us + * to proceed. + * + * Otherwise, if they cannot evict the inode, they are expected to poke + * the inode glock (note: not the iopen glock). We will notice that + * and stop waiting for the iopen glock immediately. 
The other node(s) + * are then expected to take care of deleting the inode when they no + * longer use it. + * + * As a last resort, if another node keeps holding the iopen glock + * without showing any activity on the inode glock, we will eventually + * time out and fail the iopen glock upgrade. * * Note that we're passing the LM_FLAG_TRY_1CB flag to the first * locking request as an optimization to notify lock holders as soon as @@ -1401,10 +1422,8 @@ static void gfs2_evict_inode(struct inode *inode) if (gfs2_rs_active(&ip->i_res)) gfs2_rs_deltree(&ip->i_res); - if (gfs2_holder_initialized(&gh)) { - glock_clear_object(ip->i_gl, ip); + if (gfs2_holder_initialized(&gh)) gfs2_glock_dq_uninit(&gh); - } if (ret && ret != GLR_TRYFAILED && ret != -EROFS) fs_warn(sdp, "gfs2_evict_inode: %d\n", ret); out: diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index d87ea98cf535..454dc2ff8b5e 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -87,6 +87,7 @@ static ssize_t status_show(struct gfs2_sbd *sdp, char *buf) "Withdraw In Prog: %d\n" "Remote Withdraw: %d\n" "Withdraw Recovery: %d\n" + "Deactivating: %d\n" "sd_log_error: %d\n" "sd_log_flush_lock: %d\n" "sd_log_num_revoke: %u\n" @@ -115,6 +116,7 @@ static ssize_t status_show(struct gfs2_sbd *sdp, char *buf) test_bit(SDF_WITHDRAW_IN_PROG, &f), test_bit(SDF_REMOTE_WITHDRAW, &f), test_bit(SDF_WITHDRAW_RECOVERY, &f), + test_bit(SDF_DEACTIVATING, &f), sdp->sd_log_error, rwsem_is_locked(&sdp->sd_log_flush_lock), sdp->sd_log_num_revoke, @@ -767,10 +769,10 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp) wait_for_completion(&sdp->sd_kobj_unregister); } -static int gfs2_uevent(struct kobject *kobj, struct kobj_uevent_env *env) +static int gfs2_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) { - struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); - struct super_block *s = sdp->sd_vfs; + const struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + const struct super_block *s = sdp->sd_vfs; add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 518c0677e12a..adf6d17cf033 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1225,7 +1225,7 @@ int __gfs2_xattr_set(struct inode *inode, const char *name, } static int gfs2_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig index 129926b5142d..d985066006d5 100644 --- a/fs/hfs/Kconfig +++ b/fs/hfs/Kconfig @@ -3,6 +3,7 @@ config HFS_FS tristate "Apple Macintosh file system support" depends on BLOCK select NLS + select LEGACY_DIRECT_IO help If you say Y here, you will be able to mount Macintosh-formatted floppy disks and hard drive partitions with full read-write access. 
diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c index 2bd54efaf416..6341bb248247 100644 --- a/fs/hfs/attr.c +++ b/fs/hfs/attr.c @@ -121,7 +121,7 @@ static int hfs_xattr_get(const struct xattr_handler *handler, } static int hfs_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index 2015e42e752a..6add6ebfef89 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -274,6 +274,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) tree->node_hash[hash] = node; tree->node_hash_cnt++; } else { + hfs_bnode_get(node2); spin_unlock(&tree->hash_lock); kfree(node); wait_event(node2->lock_wq, !test_bit(HFS_BNODE_NEW, &node2->flags)); diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 527f6e46cbe8..3e1e3dcf0b48 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -189,7 +189,7 @@ static int hfs_dir_release(struct inode *inode, struct file *file) * a directory and return a corresponding inode, given the inode for * the directory and the name (and its length) of the new file. */ -static int hfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int hfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; @@ -219,7 +219,7 @@ static int hfs_create(struct user_namespace *mnt_userns, struct inode *dir, * in a directory, given the inode for the parent directory and the * name (and its length) of the new directory. */ -static int hfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int hfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; @@ -280,7 +280,7 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry) * new file/directory. * XXX: how do you handle must_be dir? */ -static int hfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int hfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index 3f7e9bef9874..6d1878b99b30 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c @@ -486,7 +486,7 @@ void hfs_file_truncate(struct inode *inode) inode->i_size); if (inode->i_size > HFS_I(inode)->phys_size) { struct address_space *mapping = inode->i_mapping; - void *fsdata; + void *fsdata = NULL; struct page *page; /* XXX: Can use generic_cont_expand? 
*/ diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 68d0305880f7..49d02524e667 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -206,7 +206,7 @@ int hfs_write_begin(struct file *file, struct address_space *mapping, extern struct inode *hfs_new_inode(struct inode *, const struct qstr *, umode_t); extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *); extern int hfs_write_inode(struct inode *, struct writeback_control *); -extern int hfs_inode_setattr(struct user_namespace *, struct dentry *, +extern int hfs_inode_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, __be32 log_size, __be32 phys_size, u32 clump_size); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 3a155c1d810e..1f7bd068acf0 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -611,14 +611,14 @@ static int hfs_file_release(struct inode *inode, struct file *file) * correspond to the same HFS file. */ -int hfs_inode_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int hfs_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct hfs_sb_info *hsb = HFS_SB(inode->i_sb); int error; - error = setattr_prepare(&init_user_ns, dentry, + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); /* basic permission checks */ if (error) return error; @@ -658,7 +658,7 @@ int hfs_inode_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, current_time(inode); } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig index 7d4229aecec0..8034e7827a69 100644 --- a/fs/hfsplus/Kconfig +++ b/fs/hfsplus/Kconfig @@ -4,6 +4,7 @@ config HFSPLUS_FS depends on BLOCK select NLS select NLS_UTF8 + select LEGACY_DIRECT_IO help If you say Y here, you will be able to mount extended format Macintosh-formatted hard drive partitions with full read-write access. 
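Both hfs_inode_setattr() above and hfsplus_setattr() below follow the same VFS contract: validate the change with setattr_prepare(), apply any size change, then commit with setattr_copy() and mark the inode dirty. A minimal sketch of that contract under the new mnt_idmap API follows; the foo_* name is hypothetical, and a filesystem that never sees idmapped mounts (as hfs and hfsplus here) passes &nop_mnt_idmap into the helpers instead of forwarding its idmap argument:

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mnt_idmap.h>

static int foo_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
		       struct iattr *attr)
{
	struct inode *inode = d_inode(dentry);
	int error;

	/* Basic permission checks against the (possibly idmapped) mount. */
	error = setattr_prepare(idmap, dentry, attr);
	if (error)
		return error;

	/* Apply a size change before copying the remaining attributes. */
	if ((attr->ia_valid & ATTR_SIZE) &&
	    attr->ia_size != i_size_read(inode))
		truncate_setsize(inode, attr->ia_size);

	/* Copy the validated attributes into the inode and dirty it. */
	setattr_copy(idmap, inode, attr);
	mark_inode_dirty(inode);
	return 0;
}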
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 84714bbccc12..56fb5f1312e7 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -434,7 +434,7 @@ out: return res; } -static int hfsplus_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int hfsplus_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); @@ -476,7 +476,7 @@ out: return res; } -static int hfsplus_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int hfsplus_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); @@ -517,19 +517,19 @@ out: return res; } -static int hfsplus_create(struct user_namespace *mnt_userns, struct inode *dir, +static int hfsplus_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - return hfsplus_mknod(&init_user_ns, dir, dentry, mode, 0); + return hfsplus_mknod(&nop_mnt_idmap, dir, dentry, mode, 0); } -static int hfsplus_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int hfsplus_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { - return hfsplus_mknod(&init_user_ns, dir, dentry, mode | S_IFDIR, 0); + return hfsplus_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0); } -static int hfsplus_rename(struct user_namespace *mnt_userns, +static int hfsplus_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index 721f779b4ec3..7a542f3dbe50 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -554,7 +554,7 @@ void hfsplus_file_truncate(struct inode *inode) if (inode->i_size > hip->phys_size) { struct address_space *mapping = inode->i_mapping; struct page *page; - void *fsdata; + void *fsdata = NULL; loff_t size = inode->i_size; res = hfsplus_write_begin(NULL, mapping, size, 0, diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 6aa919e59483..7ededcb720c1 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -481,13 +481,13 @@ void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork); int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd); int hfsplus_cat_write_inode(struct inode *inode); -int hfsplus_getattr(struct user_namespace *mnt_userns, const struct path *path, +int hfsplus_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); int hfsplus_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int hfsplus_fileattr_set(struct user_namespace *mnt_userns, +int hfsplus_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); /* ioctl.c */ diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 840577a0c1e7..abb91f5fae92 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -246,13 +246,13 @@ static int hfsplus_file_release(struct inode *inode, struct file *file) return 0; } -static int hfsplus_setattr(struct user_namespace *mnt_userns, +static int hfsplus_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(&init_user_ns, dentry, attr); + 
error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; @@ -270,13 +270,13 @@ static int hfsplus_setattr(struct user_namespace *mnt_userns, inode->i_mtime = inode->i_ctime = current_time(inode); } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } -int hfsplus_getattr(struct user_namespace *mnt_userns, const struct path *path, +int hfsplus_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -298,7 +298,7 @@ int hfsplus_getattr(struct user_namespace *mnt_userns, const struct path *path, stat->attributes_mask |= STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP; - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); return 0; } @@ -390,7 +390,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir, return NULL; inode->i_ino = sbi->next_cnid++; - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); set_nlink(inode, 1); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); @@ -655,7 +655,7 @@ int hfsplus_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -int hfsplus_fileattr_set(struct user_namespace *mnt_userns, +int hfsplus_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 49891b12c415..58021e73c00b 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -257,7 +257,7 @@ end_attr_file_creation: int __hfsplus_setxattr(struct inode *inode, const char *name, const void *value, size_t size, int flags) { - int err = 0; + int err; struct hfs_find_data cat_fd; hfsplus_cat_entry entry; u16 cat_entry_flags, cat_entry_type; @@ -494,7 +494,7 @@ ssize_t __hfsplus_getxattr(struct inode *inode, const char *name, __be32 xattr_record_type; u32 record_type; u16 record_length = 0; - ssize_t res = 0; + ssize_t res; if ((!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) || @@ -606,7 +606,7 @@ static inline int can_list(const char *xattr_name) static ssize_t hfsplus_listxattr_finder_info(struct dentry *dentry, char *buffer, size_t size) { - ssize_t res = 0; + ssize_t res; struct inode *inode = d_inode(dentry); struct hfs_find_data fd; u16 entry_type; @@ -674,10 +674,9 @@ end_listxattr_finder_info: ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) { ssize_t err; - ssize_t res = 0; + ssize_t res; struct inode *inode = d_inode(dentry); struct hfs_find_data fd; - u16 key_len = 0; struct hfsplus_attr_key attr_key; char *strbuf; int xattr_name_len; @@ -719,7 +718,8 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) } for (;;) { - key_len = hfs_bnode_read_u16(fd.bnode, fd.keyoffset); + u16 key_len = hfs_bnode_read_u16(fd.bnode, fd.keyoffset); + if (key_len == 0 || key_len > fd.tree->max_key_len) { pr_err("invalid xattr key length: %d\n", key_len); res = -EIO; @@ -766,12 +766,12 @@ out: static int hfsplus_removexattr(struct inode *inode, const char *name) { - int err = 0; + int err; struct hfs_find_data cat_fd; u16 flags; u16 cat_entry_type; - int is_xattr_acl_deleted = 0; - int is_all_xattrs_deleted = 0; + int is_xattr_acl_deleted; + int is_all_xattrs_deleted; if (!HFSPLUS_SB(inode->i_sb)->attr_tree) return -EOPNOTSUPP; @@ -857,7 +857,7 @@ static int hfsplus_osx_getxattr(const struct xattr_handler 
*handler, } static int hfsplus_osx_setxattr(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c index c1c7a16cbf21..90f68ec119cd 100644 --- a/fs/hfsplus/xattr_security.c +++ b/fs/hfsplus/xattr_security.c @@ -23,7 +23,7 @@ static int hfsplus_security_getxattr(const struct xattr_handler *handler, } static int hfsplus_security_setxattr(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c index e150372ec564..fdbaebc1c49a 100644 --- a/fs/hfsplus/xattr_trusted.c +++ b/fs/hfsplus/xattr_trusted.c @@ -22,7 +22,7 @@ static int hfsplus_trusted_getxattr(const struct xattr_handler *handler, } static int hfsplus_trusted_setxattr(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c index a6b60b153916..6464b6c3d58d 100644 --- a/fs/hfsplus/xattr_user.c +++ b/fs/hfsplus/xattr_user.c @@ -22,7 +22,7 @@ static int hfsplus_user_getxattr(const struct xattr_handler *handler, } static int hfsplus_user_setxattr(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/hostfs/Makefile b/fs/hostfs/Makefile index d5beaffad43b..587bcd6e50a3 100644 --- a/fs/hostfs/Makefile +++ b/fs/hostfs/Makefile @@ -8,4 +8,4 @@ hostfs-objs := hostfs_kern.o hostfs_user.o obj-y := obj-$(CONFIG_HOSTFS) += hostfs.o -include arch/um/scripts/Makefile.rules +include $(srctree)/arch/um/scripts/Makefile.rules diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 277468783fee..c18bb50c31b6 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -559,7 +559,7 @@ static int read_name(struct inode *ino, char *name) return 0; } -static int hostfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int hostfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; @@ -658,7 +658,7 @@ static int hostfs_unlink(struct inode *ino, struct dentry *dentry) return err; } -static int hostfs_symlink(struct user_namespace *mnt_userns, struct inode *ino, +static int hostfs_symlink(struct mnt_idmap *idmap, struct inode *ino, struct dentry *dentry, const char *to) { char *file; @@ -671,7 +671,7 @@ static int hostfs_symlink(struct user_namespace *mnt_userns, struct inode *ino, return err; } -static int hostfs_mkdir(struct user_namespace *mnt_userns, struct inode *ino, +static int hostfs_mkdir(struct mnt_idmap *idmap, struct inode *ino, struct dentry *dentry, umode_t mode) { char *file; @@ -696,7 +696,7 @@ static int hostfs_rmdir(struct inode *ino, struct dentry *dentry) return err; } -static int hostfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int hostfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; @@ -734,7 +734,7 @@ static int 
hostfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, return err; } -static int hostfs_rename2(struct user_namespace *mnt_userns, +static int hostfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) @@ -763,7 +763,7 @@ static int hostfs_rename2(struct user_namespace *mnt_userns, return err; } -static int hostfs_permission(struct user_namespace *mnt_userns, +static int hostfs_permission(struct mnt_idmap *idmap, struct inode *ino, int desired) { char *name; @@ -786,11 +786,11 @@ static int hostfs_permission(struct user_namespace *mnt_userns, err = access_file(name, r, w, x); __putname(name); if (!err) - err = generic_permission(&init_user_ns, ino, desired); + err = generic_permission(&nop_mnt_idmap, ino, desired); return err; } -static int hostfs_setattr(struct user_namespace *mnt_userns, +static int hostfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -800,7 +800,7 @@ static int hostfs_setattr(struct user_namespace *mnt_userns, int fd = HOSTFS_I(inode)->fd; - err = setattr_prepare(&init_user_ns, dentry, attr); + err = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (err) return err; @@ -857,7 +857,7 @@ static int hostfs_setattr(struct user_namespace *mnt_userns, attr->ia_size != i_size_read(inode)) truncate_setsize(inode, attr->ia_size); - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 167ec6884642..f5a2476c47bf 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -280,7 +280,7 @@ void hpfs_init_inode(struct inode *); void hpfs_read_inode(struct inode *); void hpfs_write_inode(struct inode *); void hpfs_write_inode_nolock(struct inode *); -int hpfs_setattr(struct user_namespace *, struct dentry *, struct iattr *); +int hpfs_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); void hpfs_write_if_changed(struct inode *); void hpfs_evict_inode(struct inode *); diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 82208cc28ebd..e50e92a42432 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -257,7 +257,7 @@ void hpfs_write_inode_nolock(struct inode *i) brelse(bh); } -int hpfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int hpfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -275,7 +275,7 @@ int hpfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) goto out_unlock; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) goto out_unlock; @@ -289,7 +289,7 @@ int hpfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, hpfs_truncate(inode); } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); hpfs_write_inode(inode); diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 15fc63276caa..69fb40b2c99a 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -20,7 +20,7 @@ static void hpfs_update_directory_times(struct inode *dir) hpfs_write_inode_nolock(dir); } -static int hpfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { const unsigned char *name = 
dentry->d_name.name; @@ -129,7 +129,7 @@ bail: return err; } -static int hpfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int hpfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { const unsigned char *name = dentry->d_name.name; @@ -217,7 +217,7 @@ bail: return err; } -static int hpfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int hpfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { const unsigned char *name = dentry->d_name.name; @@ -292,7 +292,7 @@ bail: return err; } -static int hpfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int hpfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symlink) { const unsigned char *name = dentry->d_name.name; @@ -512,7 +512,7 @@ const struct address_space_operations hpfs_symlink_aops = { .read_folio = hpfs_symlink_read_folio }; -static int hpfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int hpfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 790d2727141a..9062da6da567 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -132,7 +132,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * way when do_mmap unwinds (may be important on powerpc * and ia64). */ - vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; + vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND); vma->vm_ops = &hugetlb_vm_ops; ret = seal_check_future_write(info->seals, vma); @@ -388,9 +388,7 @@ static bool hugetlb_vma_maps_page(struct vm_area_struct *vma, { pte_t *ptep, pte; - ptep = huge_pte_offset(vma->vm_mm, addr, - huge_page_size(hstate_vma(vma))); - + ptep = hugetlb_walk(vma, addr, huge_page_size(hstate_vma(vma))); if (!ptep) return false; @@ -412,10 +410,12 @@ static bool hugetlb_vma_maps_page(struct vm_area_struct *vma, */ static unsigned long vma_offset_start(struct vm_area_struct *vma, pgoff_t start) { + unsigned long offset = 0; + if (vma->vm_pgoff < start) - return (start - vma->vm_pgoff) << PAGE_SHIFT; - else - return 0; + offset = (start - vma->vm_pgoff) << PAGE_SHIFT; + + return vma->vm_start + offset; } static unsigned long vma_offset_end(struct vm_area_struct *vma, pgoff_t end) @@ -457,7 +457,7 @@ retry: v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); - if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page)) + if (!hugetlb_vma_maps_page(vma, v_start, page)) continue; if (!hugetlb_vma_trylock_write(vma)) { @@ -473,8 +473,8 @@ retry: break; } - unmap_hugepage_range(vma, vma->vm_start + v_start, v_end, - NULL, ZAP_FLAG_DROP_MARKER); + unmap_hugepage_range(vma, v_start, v_end, NULL, + ZAP_FLAG_DROP_MARKER); hugetlb_vma_unlock_write(vma); } @@ -507,10 +507,9 @@ retry: */ v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); - if (hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page)) - unmap_hugepage_range(vma, vma->vm_start + v_start, - v_end, NULL, - ZAP_FLAG_DROP_MARKER); + if (hugetlb_vma_maps_page(vma, v_start, page)) + unmap_hugepage_range(vma, v_start, v_end, NULL, + ZAP_FLAG_DROP_MARKER); kref_put(&vma_lock->refs, hugetlb_vma_lock_release); hugetlb_vma_unlock_write(vma); @@ -540,8 +539,7 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, 
v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); - unmap_hugepage_range(vma, vma->vm_start + v_start, v_end, - NULL, zap_flags); + unmap_hugepage_range(vma, v_start, v_end, NULL, zap_flags); /* * Note that vma lock only exists for shared/non-private @@ -813,7 +811,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * as input to create an allocation policy. */ vma_init(&pseudo_vma, mm); - pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); + vm_flags_init(&pseudo_vma, VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pseudo_vma.vm_file = file; for (index = start; index < end; index++) { @@ -821,8 +819,9 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * This is supposed to be the vaddr where the page is being * faulted in, but we have no vaddr here. */ - struct page *page; + struct folio *folio; unsigned long addr; + bool present; cond_resched(); @@ -846,48 +845,49 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, mutex_lock(&hugetlb_fault_mutex_table[hash]); /* See if already present in mapping to avoid alloc/free */ - page = find_get_page(mapping, index); - if (page) { - put_page(page); + rcu_read_lock(); + present = page_cache_next_miss(mapping, index, 1) != index; + rcu_read_unlock(); + if (present) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); hugetlb_drop_vma_policy(&pseudo_vma); continue; } /* - * Allocate page without setting the avoid_reserve argument. + * Allocate folio without setting the avoid_reserve argument. * There certainly are no reserves associated with the * pseudo_vma. However, there could be shared mappings with * reserves for the file at the inode level. If we fallocate - * pages in these areas, we need to consume the reserves + * folios in these areas, we need to consume the reserves * to keep reservation accounting consistent. 
*/ - page = alloc_huge_page(&pseudo_vma, addr, 0); + folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0); hugetlb_drop_vma_policy(&pseudo_vma); - if (IS_ERR(page)) { + if (IS_ERR(folio)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); - error = PTR_ERR(page); + error = PTR_ERR(folio); goto out; } - clear_huge_page(page, addr, pages_per_huge_page(h)); - __SetPageUptodate(page); - error = hugetlb_add_to_page_cache(page, mapping, index); + clear_huge_page(&folio->page, addr, pages_per_huge_page(h)); + __folio_mark_uptodate(folio); + error = hugetlb_add_to_page_cache(folio, mapping, index); if (unlikely(error)) { - restore_reserve_on_error(h, &pseudo_vma, addr, page); - put_page(page); + restore_reserve_on_error(h, &pseudo_vma, addr, folio); + folio_put(folio); mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out; } mutex_unlock(&hugetlb_fault_mutex_table[hash]); - SetHPageMigratable(page); + folio_set_hugetlb_migratable(folio); /* - * unlock_page because locked by hugetlb_add_to_page_cache() - * put_page() due to reference from alloc_huge_page() + * folio_unlock because locked by hugetlb_add_to_page_cache() + * folio_put() due to reference from alloc_hugetlb_folio() */ - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) @@ -898,7 +898,7 @@ out: return error; } -static int hugetlbfs_setattr(struct user_namespace *mnt_userns, +static int hugetlbfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -907,7 +907,7 @@ static int hugetlbfs_setattr(struct user_namespace *mnt_userns, unsigned int ia_valid = attr->ia_valid; struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; @@ -924,7 +924,7 @@ static int hugetlbfs_setattr(struct user_namespace *mnt_userns, hugetlb_vmtruncate(inode, newsize); } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } @@ -980,7 +980,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); inode->i_ino = get_next_ino(); - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; @@ -1019,7 +1019,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, /* * File creation. Allocate an inode, and we're done.. 
*/ -static int hugetlbfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; @@ -1033,24 +1033,24 @@ static int hugetlbfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, return 0; } -static int hugetlbfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { - int retval = hugetlbfs_mknod(&init_user_ns, dir, dentry, + int retval = hugetlbfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0); if (!retval) inc_nlink(dir); return retval; } -static int hugetlbfs_create(struct user_namespace *mnt_userns, +static int hugetlbfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - return hugetlbfs_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0); + return hugetlbfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFREG, 0); } -static int hugetlbfs_tmpfile(struct user_namespace *mnt_userns, +static int hugetlbfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { @@ -1064,7 +1064,7 @@ static int hugetlbfs_tmpfile(struct user_namespace *mnt_userns, return finish_open_simple(file, 0); } -static int hugetlbfs_symlink(struct user_namespace *mnt_userns, +static int hugetlbfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { diff --git a/fs/init.c b/fs/init.c index 5c36adaa9b44..9684406a8416 100644 --- a/fs/init.c +++ b/fs/init.c @@ -157,7 +157,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev) mode &= ~current_umask(); error = security_path_mknod(&path, dentry, mode, dev); if (!error) - error = vfs_mknod(mnt_user_ns(path.mnt), path.dentry->d_inode, + error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, mode, new_decode_dev(dev)); done_path_create(&path, dentry); return error; @@ -167,7 +167,7 @@ int __init init_link(const char *oldname, const char *newname) { struct dentry *new_dentry; struct path old_path, new_path; - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; int error; error = kern_path(oldname, 0, &old_path); @@ -182,14 +182,14 @@ int __init init_link(const char *oldname, const char *newname) error = -EXDEV; if (old_path.mnt != new_path.mnt) goto out_dput; - mnt_userns = mnt_user_ns(new_path.mnt); - error = may_linkat(mnt_userns, &old_path); + idmap = mnt_idmap(new_path.mnt); + error = may_linkat(idmap, &old_path); if (unlikely(error)) goto out_dput; error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_dput; - error = vfs_link(old_path.dentry, mnt_userns, new_path.dentry->d_inode, + error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, new_dentry, NULL); out_dput: done_path_create(&new_path, new_dentry); @@ -209,7 +209,7 @@ int __init init_symlink(const char *oldname, const char *newname) return PTR_ERR(dentry); error = security_path_symlink(&path, dentry, oldname); if (!error) - error = vfs_symlink(mnt_user_ns(path.mnt), path.dentry->d_inode, + error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, oldname); done_path_create(&path, dentry); return error; @@ -233,7 +233,7 @@ int __init init_mkdir(const char *pathname, umode_t mode) mode &= ~current_umask(); error = security_path_mkdir(&path, dentry, mode); if (!error) - error = vfs_mkdir(mnt_user_ns(path.mnt), 
path.dentry->d_inode, + error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, mode); done_path_create(&path, dentry); return error; diff --git a/fs/inode.c b/fs/inode.c index f453eb58fd03..4558dc2f1355 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -5,6 +5,7 @@ */ #include <linux/export.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/mm.h> #include <linux/backing-dev.h> #include <linux/hash.h> @@ -1893,7 +1894,7 @@ bool atime_needs_update(const struct path *path, struct inode *inode) /* Atime updates will likely cause i_uid and i_gid to be written * back improprely if their true value is unknown to the vfs. */ - if (HAS_UNMAPPED_ID(mnt_user_ns(mnt), inode)) + if (HAS_UNMAPPED_ID(mnt_idmap(mnt), inode)) return false; if (IS_NOATIME(inode)) @@ -1953,7 +1954,7 @@ EXPORT_SYMBOL(touch_atime); * response to write or truncate. Return 0 if nothing has to be changed. * Negative value on error (change should be denied). */ -int dentry_needs_remove_privs(struct user_namespace *mnt_userns, +int dentry_needs_remove_privs(struct mnt_idmap *idmap, struct dentry *dentry) { struct inode *inode = d_inode(dentry); @@ -1963,7 +1964,7 @@ int dentry_needs_remove_privs(struct user_namespace *mnt_userns, if (IS_NOSEC(inode)) return 0; - mask = setattr_should_drop_suidgid(mnt_userns, inode); + mask = setattr_should_drop_suidgid(idmap, inode); ret = security_inode_need_killpriv(dentry); if (ret < 0) return ret; @@ -1972,7 +1973,7 @@ int dentry_needs_remove_privs(struct user_namespace *mnt_userns, return mask; } -static int __remove_privs(struct user_namespace *mnt_userns, +static int __remove_privs(struct mnt_idmap *idmap, struct dentry *dentry, int kill) { struct iattr newattrs; @@ -1982,7 +1983,7 @@ static int __remove_privs(struct user_namespace *mnt_userns, * Note we call this on write, so notify_change will not * encounter any conflicting delegations: */ - return notify_change(mnt_userns, dentry, &newattrs, NULL); + return notify_change(idmap, dentry, &newattrs, NULL); } static int __file_remove_privs(struct file *file, unsigned int flags) @@ -1995,7 +1996,7 @@ static int __file_remove_privs(struct file *file, unsigned int flags) if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode)) return 0; - kill = dentry_needs_remove_privs(file_mnt_user_ns(file), dentry); + kill = dentry_needs_remove_privs(file_mnt_idmap(file), dentry); if (kill < 0) return kill; @@ -2003,7 +2004,7 @@ static int __file_remove_privs(struct file *file, unsigned int flags) if (flags & IOCB_NOWAIT) return -EAGAIN; - error = __remove_privs(file_mnt_user_ns(file), dentry, kill); + error = __remove_privs(file_mnt_idmap(file), dentry, kill); } if (!error) @@ -2279,21 +2280,21 @@ EXPORT_SYMBOL(init_special_inode); /** * inode_init_owner - Init uid,gid,mode for new inode according to posix standards - * @mnt_userns: User namespace of the mount the inode was created from + * @idmap: idmap of the mount the inode was created from * @inode: New inode * @dir: Directory inode * @mode: mode of the new inode * - * If the inode has been created through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions + * If the inode has been created through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions * and initializing i_uid and i_gid. 
On non-idmapped mounts or if permission - * checking is to be performed on the raw inode simply passs init_user_ns. + * checking is to be performed on the raw inode simply pass @nop_mnt_idmap. */ -void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode, +void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode, const struct inode *dir, umode_t mode) { - inode_fsuid_set(inode, mnt_userns); + inode_fsuid_set(inode, idmap); if (dir && dir->i_mode & S_ISGID) { inode->i_gid = dir->i_gid; @@ -2301,32 +2302,32 @@ void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode, if (S_ISDIR(mode)) mode |= S_ISGID; } else - inode_fsgid_set(inode, mnt_userns); + inode_fsgid_set(inode, idmap); inode->i_mode = mode; } EXPORT_SYMBOL(inode_init_owner); /** * inode_owner_or_capable - check current task permissions to inode - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: inode being checked * * Return true if current either has CAP_FOWNER in a namespace with the * inode owner uid mapped, or owns the file. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. */ -bool inode_owner_or_capable(struct user_namespace *mnt_userns, +bool inode_owner_or_capable(struct mnt_idmap *idmap, const struct inode *inode) { vfsuid_t vfsuid; struct user_namespace *ns; - vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return true; @@ -2458,7 +2459,7 @@ EXPORT_SYMBOL(current_time); /** * in_group_or_capable - check whether caller is CAP_FSETID privileged - * @mnt_userns: user namespace of the mount @inode was found from + * @idmap: idmap of the mount @inode was found from * @inode: inode to check * @vfsgid: the new/current vfsgid of @inode * * @@ -2468,19 +2469,19 @@ EXPORT_SYMBOL(current_time); * * Return: true if the caller is sufficiently privileged, false if not.
*/ -bool in_group_or_capable(struct user_namespace *mnt_userns, +bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t vfsgid) { if (vfsgid_in_group_p(vfsgid)) return true; - if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + if (capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID)) return true; return false; } /** * mode_strip_sgid - handle the sgid bit for non-directories - * @mnt_userns: User namespace of the mount the inode was created from + * @idmap: idmap of the mount the inode was created from * @dir: parent directory inode * @mode: mode of the file to be created in @dir * @@ -2492,15 +2493,14 @@ bool in_group_or_capable(struct user_namespace *mnt_userns, * * Return: the new mode to use for the file */ -umode_t mode_strip_sgid(struct user_namespace *mnt_userns, +umode_t mode_strip_sgid(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode) { if ((mode & (S_ISGID | S_IXGRP)) != (S_ISGID | S_IXGRP)) return mode; if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID)) return mode; - if (in_group_or_capable(mnt_userns, dir, - i_gid_into_vfsgid(mnt_userns, dir))) + if (in_group_or_capable(idmap, dir, i_gid_into_vfsgid(idmap, dir))) return mode; return mode & ~S_ISGID; } diff --git a/fs/internal.h b/fs/internal.h index a803cc3cf716..dc4eb91a577a 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -14,9 +14,9 @@ struct path; struct mount; struct shrink_control; struct fs_context; -struct user_namespace; struct pipe_inode_info; struct iov_iter; +struct mnt_idmap; /* * block/bdev.c @@ -63,7 +63,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); int do_rmdir(int dfd, struct filename *name); int do_unlinkat(int dfd, struct filename *name); -int may_linkat(struct user_namespace *mnt_userns, const struct path *link); +int may_linkat(struct mnt_idmap *idmap, const struct path *link); int do_renameat2(int olddfd, struct filename *oldname, int newdfd, struct filename *newname, unsigned int flags); int do_mkdirat(int dfd, struct filename *name, umode_t mode); @@ -120,6 +120,7 @@ extern bool trylock_super(struct super_block *sb); struct super_block *user_get_super(dev_t, bool excl); void put_super(struct super_block *sb); extern bool mount_capable(struct fs_context *); +int sb_init_dio_done_wq(struct super_block *sb); /* * open.c @@ -150,8 +151,8 @@ extern int vfs_open(const struct path *, struct file *); * inode.c */ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); -int dentry_needs_remove_privs(struct user_namespace *, struct dentry *dentry); -bool in_group_or_capable(struct user_namespace *mnt_userns, +int dentry_needs_remove_privs(struct mnt_idmap *, struct dentry *dentry); +bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t vfsgid); /* @@ -187,9 +188,6 @@ extern void mnt_pin_kill(struct mount *m); */ extern const struct dentry_operations ns_dentry_operations; -/* direct-io.c: */ -int sb_init_dio_done_wq(struct super_block *sb); - /* * fs/stat.c: */ @@ -234,7 +232,7 @@ ssize_t do_getxattr(struct mnt_idmap *idmap, int setxattr_copy(const char __user *name, struct xattr_ctx *ctx); int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_ctx *ctx); -int may_write_xattr(struct user_namespace *mnt_userns, struct inode *inode); +int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode); #ifdef CONFIG_FS_POSIX_ACL int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, @@ -261,5 +259,8 @@ ssize_t 
__kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *po /* * fs/attr.c */ -int setattr_should_drop_sgid(struct user_namespace *mnt_userns, +int setattr_should_drop_sgid(struct mnt_idmap *idmap, const struct inode *inode); +struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns); +struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap); +void mnt_idmap_put(struct mnt_idmap *idmap); diff --git a/fs/ioctl.c b/fs/ioctl.c index 80ac36aea913..5b2481cd4750 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -651,7 +651,7 @@ static int fileattr_set_prepare(struct inode *inode, /** * vfs_fileattr_set - change miscellaneous file attributes - * @mnt_userns: user namespace of the mount + * @idmap: idmap of the mount * @dentry: the object to change * @fa: fileattr pointer * @@ -665,7 +665,7 @@ static int fileattr_set_prepare(struct inode *inode, * * Return: 0 on success, or a negative error on failure. */ -int vfs_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry, +int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); @@ -675,7 +675,7 @@ int vfs_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry, if (!inode->i_op->fileattr_set) return -ENOIOCTLCMD; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EPERM; inode_lock(inode); @@ -693,7 +693,7 @@ int vfs_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry, } err = fileattr_set_prepare(inode, &old_ma, fa); if (!err) - err = inode->i_op->fileattr_set(mnt_userns, dentry, fa); + err = inode->i_op->fileattr_set(idmap, dentry, fa); } inode_unlock(inode); @@ -714,7 +714,7 @@ static int ioctl_getflags(struct file *file, unsigned int __user *argp) static int ioctl_setflags(struct file *file, unsigned int __user *argp) { - struct user_namespace *mnt_userns = file_mnt_user_ns(file); + struct mnt_idmap *idmap = file_mnt_idmap(file); struct dentry *dentry = file->f_path.dentry; struct fileattr fa; unsigned int flags; @@ -725,7 +725,7 @@ static int ioctl_setflags(struct file *file, unsigned int __user *argp) err = mnt_want_write_file(file); if (!err) { fileattr_fill_flags(&fa, flags); - err = vfs_fileattr_set(mnt_userns, dentry, &fa); + err = vfs_fileattr_set(idmap, dentry, &fa); mnt_drop_write_file(file); } } @@ -746,7 +746,7 @@ static int ioctl_fsgetxattr(struct file *file, void __user *argp) static int ioctl_fssetxattr(struct file *file, void __user *argp) { - struct user_namespace *mnt_userns = file_mnt_user_ns(file); + struct mnt_idmap *idmap = file_mnt_idmap(file); struct dentry *dentry = file->f_path.dentry; struct fileattr fa; int err; @@ -755,7 +755,7 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp) if (!err) { err = mnt_want_write_file(file); if (!err) { - err = vfs_fileattr_set(mnt_userns, dentry, &fa); + err = vfs_fileattr_set(idmap, dentry, &fa); mnt_drop_write_file(file); } } diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 356193e44cf0..6f4c97a6d7e9 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -457,6 +457,33 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) } EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); +/** + * iomap_get_folio - get a folio reference for writing + * @iter: iteration structure + * @pos: start offset of write + * + * Returns a locked reference to the folio at @pos, or an error pointer if the + * folio could not be obtained. 
+ */ +struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos) +{ + unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; + struct folio *folio; + + if (iter->flags & IOMAP_NOWAIT) + fgp |= FGP_NOWAIT; + + folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, + fgp, mapping_gfp_mask(iter->inode->i_mapping)); + if (folio) + return folio; + + if (iter->flags & IOMAP_NOWAIT) + return ERR_PTR(-EAGAIN); + return ERR_PTR(-ENOMEM); +} +EXPORT_SYMBOL_GPL(iomap_get_folio); + bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags) { trace_iomap_release_folio(folio->mapping->host, folio_pos(folio), @@ -575,6 +602,30 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, return 0; } +static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos, + size_t len) +{ + const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; + + if (folio_ops && folio_ops->get_folio) + return folio_ops->get_folio(iter, pos, len); + else + return iomap_get_folio(iter, pos); +} + +static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret, + struct folio *folio) +{ + const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; + + if (folio_ops && folio_ops->put_folio) { + folio_ops->put_folio(iter->inode, pos, ret, folio); + } else { + folio_unlock(folio); + folio_put(folio); + } +} + static int iomap_write_begin_inline(const struct iomap_iter *iter, struct folio *folio) { @@ -587,15 +638,11 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter, static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, size_t len, struct folio **foliop) { - const struct iomap_page_ops *page_ops = iter->iomap.page_ops; + const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; const struct iomap *srcmap = iomap_iter_srcmap(iter); struct folio *folio; - unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; int status = 0; - if (iter->flags & IOMAP_NOWAIT) - fgp |= FGP_NOWAIT; - BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); if (srcmap != &iter->iomap) BUG_ON(pos + len > srcmap->offset + srcmap->length); @@ -606,18 +653,9 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, if (!mapping_large_folio_support(iter->inode->i_mapping)) len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos)); - if (page_ops && page_ops->page_prepare) { - status = page_ops->page_prepare(iter->inode, pos, len); - if (status) - return status; - } - - folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, - fgp, mapping_gfp_mask(iter->inode->i_mapping)); - if (!folio) { - status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM; - goto out_no_page; - } + folio = __iomap_get_folio(iter, pos, len); + if (IS_ERR(folio)) + return PTR_ERR(folio); /* * Now we have a locked folio, before we do anything with it we need to @@ -629,9 +667,9 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, * could do the wrong thing here (zero a page range incorrectly or fail * to zero) and corrupt data. 
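
/*
 * A minimal sketch of how a filesystem might wire up the new
 * iomap_folio_ops ->get_folio/->put_folio callbacks introduced above.
 * The myfs_* names and the private i_write_lock mutex are hypothetical;
 * the callback signatures are inferred from the __iomap_get_folio() and
 * __iomap_put_folio() dispatchers in this hunk, and iomap_get_folio()
 * is the exported default shown above.
 */
struct myfs_inode {
	struct mutex	i_write_lock;	/* hypothetical fs-private lock */
	struct inode	vfs_inode;
};

static struct folio *myfs_get_folio(struct iomap_iter *iter, loff_t pos,
				    size_t len)
{
	struct myfs_inode *mi = container_of(iter->inode,
					     struct myfs_inode, vfs_inode);
	struct folio *folio;

	/* take the fs-private lock, as ->page_prepare used to */
	mutex_lock(&mi->i_write_lock);
	folio = iomap_get_folio(iter, pos);
	if (IS_ERR(folio))
		mutex_unlock(&mi->i_write_lock);
	return folio;
}

static void myfs_put_folio(struct inode *inode, loff_t pos, unsigned copied,
			   struct folio *folio)
{
	struct myfs_inode *mi = container_of(inode, struct myfs_inode,
					     vfs_inode);

	/* the callback now owns unlock/put, as the dispatcher shows */
	folio_unlock(folio);
	folio_put(folio);
	mutex_unlock(&mi->i_write_lock);
}

static const struct iomap_folio_ops myfs_folio_ops = {
	.get_folio	= myfs_get_folio,
	.put_folio	= myfs_put_folio,
};
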
*/ - if (page_ops && page_ops->iomap_valid) { - bool iomap_valid = page_ops->iomap_valid(iter->inode, - &iter->iomap); + if (folio_ops && folio_ops->iomap_valid) { + bool iomap_valid = folio_ops->iomap_valid(iter->inode, + &iter->iomap); if (!iomap_valid) { iter->iomap.flags |= IOMAP_F_STALE; status = 0; @@ -656,13 +694,9 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, return 0; out_unlock: - folio_unlock(folio); - folio_put(folio); + __iomap_put_folio(iter, pos, 0, folio); iomap_write_failed(iter->inode, pos, len); -out_no_page: - if (page_ops && page_ops->page_done) - page_ops->page_done(iter->inode, pos, 0, NULL); return status; } @@ -712,7 +746,6 @@ static size_t iomap_write_end_inline(const struct iomap_iter *iter, static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, size_t copied, struct folio *folio) { - const struct iomap_page_ops *page_ops = iter->iomap.page_ops; const struct iomap *srcmap = iomap_iter_srcmap(iter); loff_t old_size = iter->inode->i_size; size_t ret; @@ -735,14 +768,10 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, i_size_write(iter->inode, pos + ret); iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; } - folio_unlock(folio); + __iomap_put_folio(iter, pos, ret, folio); if (old_size < pos) pagecache_isize_extended(iter->inode, old_size, pos); - if (page_ops && page_ops->page_done) - page_ops->page_done(iter->inode, pos, ret, &folio->page); - folio_put(folio); - if (ret < len) iomap_write_failed(iter->inode, pos + ret, len - ret); return ret; @@ -1685,10 +1714,9 @@ done: * For unwritten space on the page, we need to start the conversion to * regular allocated space. */ -static int -iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) +static int iomap_do_writepage(struct folio *folio, + struct writeback_control *wbc, void *data) { - struct folio *folio = page_folio(page); struct iomap_writepage_ctx *wpc = data; struct inode *inode = folio->mapping->host; u64 end_pos, isize; diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 9804714b1751..f771001574d0 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -217,16 +217,10 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, { blk_opf_t opflags = REQ_SYNC | REQ_IDLE; - if (!(dio->flags & IOMAP_DIO_WRITE)) { - WARN_ON_ONCE(iomap->flags & IOMAP_F_ZONE_APPEND); + if (!(dio->flags & IOMAP_DIO_WRITE)) return REQ_OP_READ; - } - - if (iomap->flags & IOMAP_F_ZONE_APPEND) - opflags |= REQ_OP_ZONE_APPEND; - else - opflags |= REQ_OP_WRITE; + opflags |= REQ_OP_WRITE; if (use_fua) opflags |= REQ_FUA; else diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 4810438b7856..b33155dd7001 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -63,16 +63,12 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) static void release_buffer_page(struct buffer_head *bh) { struct folio *folio; - struct page *page; if (buffer_dirty(bh)) goto nope; if (atomic_read(&bh->b_count) != 1) goto nope; - page = bh->b_page; - if (!page) - goto nope; - folio = page_folio(page); + folio = bh->b_folio; if (folio->mapping) goto nope; @@ -181,31 +177,6 @@ static int journal_wait_on_commit_record(journal_t *journal, return ret; } -/* - * write the filemap data using writepage() address_space_operations. - * We don't do block allocation here even for delalloc. We don't - * use writepages() because with delayed allocation we may be doing - * block allocation in writepages(). 
- */ -int jbd2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) -{ - struct address_space *mapping = jinode->i_vfs_inode->i_mapping; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = mapping->nrpages * 2, - .range_start = jinode->i_dirty_start, - .range_end = jinode->i_dirty_end, - }; - - /* - * submit the inode data buffers. We use writepage - * instead of writepages. Because writepages can do - * block allocation with delalloc. We need to write - * only allocated blocks here. - */ - return generic_writepages(mapping, &wbc); -} - /* Send all the data buffers related to an inode */ int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode) { @@ -1040,7 +1011,7 @@ restart_loop: * already detached from the mapping and buffers cannot * get reused. */ - mapping = READ_ONCE(bh->b_page->mapping); + mapping = READ_ONCE(bh->b_folio->mapping); if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) { clear_buffer_mapped(bh); clear_buffer_new(bh); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 2696f43e7239..e80c781731f8 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -89,7 +89,6 @@ EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); EXPORT_SYMBOL(jbd2_journal_inode_ranged_write); EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait); -EXPORT_SYMBOL(jbd2_journal_submit_inode_data_buffers); EXPORT_SYMBOL(jbd2_journal_finish_inode_data_buffers); EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); @@ -2938,7 +2937,7 @@ repeat: } else { J_ASSERT_BH(bh, (atomic_read(&bh->b_count) > 0) || - (bh->b_page && bh->b_page->mapping)); + (bh->b_folio && bh->b_folio->mapping)); if (!new_jh) { jbd_unlock_bh_journal_head(bh); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 6a404ac1c178..15de1385012e 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1010,36 +1010,28 @@ repeat: * ie. locked but not dirty) or tune2fs (which may actually have * the buffer dirtied, ugh.) */ - if (buffer_dirty(bh)) { + if (buffer_dirty(bh) && jh->b_transaction) { + warn_dirty_buffer(bh); /* - * First question: is this buffer already part of the current - * transaction or the existing committing transaction? - */ - if (jh->b_transaction) { - J_ASSERT_JH(jh, - jh->b_transaction == transaction || - jh->b_transaction == - journal->j_committing_transaction); - if (jh->b_next_transaction) - J_ASSERT_JH(jh, jh->b_next_transaction == - transaction); - warn_dirty_buffer(bh); - } - /* - * In any case we need to clean the dirty flag and we must - * do it under the buffer lock to be sure we don't race - * with running write-out. + * We need to clean the dirty flag and we must do it under the + * buffer lock to be sure we don't race with running write-out. */ JBUFFER_TRACE(jh, "Journalling dirty buffer"); clear_buffer_dirty(bh); + /* + * The buffer is going to be added to BJ_Reserved list now and + * nothing guarantees jbd2_journal_dirty_metadata() will be + * ever called for it. So we need to set jbddirty bit here to + * make sure the buffer is dirtied and written out when the + * journaling machinery is done with it. 
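
/*
 * A minimal sketch, paraphrased from the jbd2 hunks above (not a verbatim
 * copy of do_get_write_access()), of the lock nesting that results: the
 * dirty bit is cleared and the jh filed on BJ_Reserved under j_list_lock,
 * and the buffer lock is dropped only afterwards, so the buffer is never
 * observable in the intermediate "not dirty and not on a transaction"
 * state that could drop it from the checkpoint list.
 *
 *	lock_buffer(bh);
 *	spin_lock(&jh->b_state_lock);
 *	...
 *	spin_lock(&journal->j_list_lock);
 *	if (test_clear_buffer_dirty(bh))
 *		set_buffer_jbddirty(bh);	-- stays dirty for jbd2
 *	__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
 *	spin_unlock(&journal->j_list_lock);
 *	unlock_buffer(bh);			-- only after the jh is filed
 */
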
+ */ set_buffer_jbddirty(bh); } - unlock_buffer(bh); - error = -EROFS; if (is_handle_aborted(handle)) { spin_unlock(&jh->b_state_lock); + unlock_buffer(bh); goto out; } error = 0; @@ -1049,8 +1041,10 @@ repeat: * b_next_transaction points to it */ if (jh->b_transaction == transaction || - jh->b_next_transaction == transaction) + jh->b_next_transaction == transaction) { + unlock_buffer(bh); goto done; + } /* * this is the first time this transaction is touching this buffer, @@ -1074,10 +1068,24 @@ repeat: */ smp_wmb(); spin_lock(&journal->j_list_lock); + if (test_clear_buffer_dirty(bh)) { + /* + * Execute buffer dirty clearing and jh->b_transaction + * assignment under journal->j_list_lock locked to + * prevent bh being removed from checkpoint list if + * the buffer is in an intermediate state (not dirty + * and jh->b_transaction is NULL). + */ + JBUFFER_TRACE(jh, "Journalling dirty buffer"); + set_buffer_jbddirty(bh); + } __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); spin_unlock(&journal->j_list_lock); + unlock_buffer(bh); goto done; } + unlock_buffer(bh); + /* * If there is already a copy-out version of this buffer, then we don't * need to make another one diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 8bb58ce5c06c..888a7ceb6479 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -229,7 +229,7 @@ static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *a return rc; } -int jffs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int jffs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int rc, xprefix; @@ -241,7 +241,7 @@ int jffs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, if (acl) { umode_t mode; - rc = posix_acl_update_mode(&init_user_ns, inode, &mode, + rc = posix_acl_update_mode(&nop_mnt_idmap, inode, &mode, &acl); if (rc) return rc; diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index ca36a6eca594..e976b8cb82cf 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -28,7 +28,7 @@ struct jffs2_acl_header { #ifdef CONFIG_JFFS2_FS_POSIX_ACL struct posix_acl *jffs2_get_acl(struct inode *inode, int type, bool rcu); -int jffs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int jffs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *); extern int jffs2_init_acl_post(struct inode *); diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index f399b390b5f6..5075a0a6d594 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -24,20 +24,20 @@ static int jffs2_readdir (struct file *, struct dir_context *); -static int jffs2_create (struct user_namespace *, struct inode *, +static int jffs2_create (struct mnt_idmap *, struct inode *, struct dentry *, umode_t, bool); static struct dentry *jffs2_lookup (struct inode *,struct dentry *, unsigned int); static int jffs2_link (struct dentry *,struct inode *,struct dentry *); static int jffs2_unlink (struct inode *,struct dentry *); -static int jffs2_symlink (struct user_namespace *, struct inode *, +static int jffs2_symlink (struct mnt_idmap *, struct inode *, struct dentry *, const char *); -static int jffs2_mkdir (struct user_namespace *, struct inode *,struct dentry *, +static int jffs2_mkdir (struct mnt_idmap *, struct inode *,struct dentry *, umode_t); static int jffs2_rmdir (struct inode *,struct dentry *); -static int jffs2_mknod (struct user_namespace *, struct inode *,struct dentry *, +static int jffs2_mknod 
(struct mnt_idmap *, struct inode *,struct dentry *, umode_t,dev_t); -static int jffs2_rename (struct user_namespace *, struct inode *, +static int jffs2_rename (struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); @@ -160,7 +160,7 @@ static int jffs2_readdir(struct file *file, struct dir_context *ctx) /***********************************************************************/ -static int jffs2_create(struct user_namespace *mnt_userns, struct inode *dir_i, +static int jffs2_create(struct mnt_idmap *idmap, struct inode *dir_i, struct dentry *dentry, umode_t mode, bool excl) { struct jffs2_raw_inode *ri; @@ -279,7 +279,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de /***********************************************************************/ -static int jffs2_symlink (struct user_namespace *mnt_userns, struct inode *dir_i, +static int jffs2_symlink (struct mnt_idmap *idmap, struct inode *dir_i, struct dentry *dentry, const char *target) { struct jffs2_inode_info *f, *dir_f; @@ -442,7 +442,7 @@ static int jffs2_symlink (struct user_namespace *mnt_userns, struct inode *dir_i } -static int jffs2_mkdir (struct user_namespace *mnt_userns, struct inode *dir_i, +static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i, struct dentry *dentry, umode_t mode) { struct jffs2_inode_info *f, *dir_f; @@ -614,7 +614,7 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) return ret; } -static int jffs2_mknod (struct user_namespace *mnt_userns, struct inode *dir_i, +static int jffs2_mknod (struct mnt_idmap *idmap, struct inode *dir_i, struct dentry *dentry, umode_t mode, dev_t rdev) { struct jffs2_inode_info *f, *dir_f; @@ -762,7 +762,7 @@ static int jffs2_mknod (struct user_namespace *mnt_userns, struct inode *dir_i, return ret; } -static int jffs2_rename (struct user_namespace *mnt_userns, +static int jffs2_rename (struct mnt_idmap *idmap, struct inode *old_dir_i, struct dentry *old_dentry, struct inode *new_dir_i, struct dentry *new_dentry, unsigned int flags) diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index e952b0bd548b..038516bee1ab 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -190,19 +190,19 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) return 0; } -int jffs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int jffs2_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); int rc; - rc = setattr_prepare(&init_user_ns, dentry, iattr); + rc = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (rc) return rc; rc = jffs2_do_setattr(inode, iattr); if (!rc && (iattr->ia_valid & ATTR_MODE)) - rc = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); + rc = posix_acl_chmod(&nop_mnt_idmap, dentry, inode->i_mode); return rc; } diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 921d782583d6..8da19766c101 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -164,7 +164,7 @@ long jffs2_ioctl(struct file *, unsigned int, unsigned long); extern const struct inode_operations jffs2_symlink_inode_operations; /* fs.c */ -int jffs2_setattr (struct user_namespace *, struct dentry *, struct iattr *); +int jffs2_setattr (struct mnt_idmap *, struct dentry *, struct iattr *); int jffs2_do_setattr (struct inode *, struct iattr *); struct inode *jffs2_iget(struct super_block *, unsigned long); void jffs2_evict_inode (struct inode *); diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index 
aef5522551db..437f3a2c1b54 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -57,7 +57,7 @@ static int jffs2_security_getxattr(const struct xattr_handler *handler, } static int jffs2_security_setxattr(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c index cc3f24883e7d..b7c5da2d89bd 100644 --- a/fs/jffs2/xattr_trusted.c +++ b/fs/jffs2/xattr_trusted.c @@ -25,7 +25,7 @@ static int jffs2_trusted_getxattr(const struct xattr_handler *handler, } static int jffs2_trusted_setxattr(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c index fb945977c013..f64edce4927b 100644 --- a/fs/jffs2/xattr_user.c +++ b/fs/jffs2/xattr_user.c @@ -25,7 +25,7 @@ static int jffs2_user_getxattr(const struct xattr_handler *handler, } static int jffs2_user_setxattr(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig index 05cb0e8e4382..51e856f0e4b8 100644 --- a/fs/jfs/Kconfig +++ b/fs/jfs/Kconfig @@ -3,6 +3,7 @@ config JFS_FS tristate "JFS filesystem support" select NLS select CRC32 + select LEGACY_DIRECT_IO help This is a port of IBM's Journaled Filesystem . More information is available in the file <file:Documentation/admin-guide/jfs.rst>. diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 3b667eccc73b..fb96f872d207 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -94,7 +94,7 @@ out: return rc; } -int jfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int jfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int rc; @@ -106,7 +106,7 @@ int jfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, tid = txBegin(inode->i_sb, 0); mutex_lock(&JFS_IP(inode)->commit_mutex); if (type == ACL_TYPE_ACCESS && acl) { - rc = posix_acl_update_mode(&init_user_ns, inode, &mode, &acl); + rc = posix_acl_update_mode(&nop_mnt_idmap, inode, &mode, &acl); if (rc) goto end_tx; if (mode != inode->i_mode) diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 88663465aecd..2ee35be49de1 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -85,24 +85,24 @@ static int jfs_release(struct inode *inode, struct file *file) return 0; } -int jfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int jfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); int rc; - rc = setattr_prepare(&init_user_ns, dentry, iattr); + rc = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (rc) return rc; - if (is_quota_modification(mnt_userns, inode, iattr)) { + if (is_quota_modification(&nop_mnt_idmap, inode, iattr)) { rc = dquot_initialize(inode); if (rc) return rc; } if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) || (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) { - rc = dquot_transfer(mnt_userns, inode, iattr); + rc = dquot_transfer(&nop_mnt_idmap, inode, iattr); if (rc) return rc; } @@ -119,11 +119,11 @@ int jfs_setattr(struct user_namespace 
*mnt_userns, struct dentry *dentry, jfs_truncate(inode); } - setattr_copy(&init_user_ns, inode, iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) - rc = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); + rc = posix_acl_chmod(&nop_mnt_idmap, dentry, inode->i_mode); return rc; } diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index 1e7b177ece60..ed7989bc2db1 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -70,7 +70,7 @@ int jfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -int jfs_fileattr_set(struct user_namespace *mnt_userns, +int jfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index f0704a25835f..f892e54d0fcd 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -8,7 +8,7 @@ #ifdef CONFIG_JFS_POSIX_ACL struct posix_acl *jfs_get_acl(struct inode *inode, int type, bool rcu); -int jfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int jfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); int jfs_init_acl(tid_t, struct inode *, struct inode *); diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 765838578a72..a3eb1e826947 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -193,7 +193,8 @@ int dbMount(struct inode *ipbmap) bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth); bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart); bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size); - if (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG) { + if (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG || + bmp->db_agl2size < 0) { err = -EINVAL; goto err_release_metapage; } diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 59379089e939..9e1f02767201 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -64,7 +64,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode) goto fail_put; } - inode_init_owner(&init_user_ns, inode, parent, mode); + inode_init_owner(&nop_mnt_idmap, inode, parent, mode); /* * New inodes need to save sane values on disk when * uid & gid mount options are used diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index 7de961a81862..ea80661597ac 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -10,7 +10,7 @@ struct fid; extern struct inode *ialloc(struct inode *, umode_t); extern int jfs_fsync(struct file *, loff_t, loff_t, int); extern int jfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); -extern int jfs_fileattr_set(struct user_namespace *mnt_userns, +extern int jfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); extern long jfs_ioctl(struct file *, unsigned int, unsigned long); extern struct inode *jfs_iget(struct super_block *, unsigned long); @@ -28,7 +28,7 @@ extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); extern void jfs_set_inode_flags(struct inode *); extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int); -extern int jfs_setattr(struct user_namespace *, struct dentry *, struct iattr *); +extern int jfs_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern const struct address_space_operations jfs_aops; extern const struct inode_operations jfs_dir_inode_operations; diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index a38d14eed047..b29d68b5eec5 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -59,7 +59,7 @@ static inline void 
free_ea_wmap(struct inode *inode) * RETURN: Errors from subroutines * */ -static int jfs_create(struct user_namespace *mnt_userns, struct inode *dip, +static int jfs_create(struct mnt_idmap *idmap, struct inode *dip, struct dentry *dentry, umode_t mode, bool excl) { int rc = 0; @@ -192,7 +192,7 @@ static int jfs_create(struct user_namespace *mnt_userns, struct inode *dip, * note: * EACCES: user needs search+write permission on the parent directory */ -static int jfs_mkdir(struct user_namespace *mnt_userns, struct inode *dip, +static int jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip, struct dentry *dentry, umode_t mode) { int rc = 0; @@ -869,7 +869,7 @@ static int jfs_link(struct dentry *old_dentry, * an intermediate result whose length exceeds PATH_MAX [XPG4.2] */ -static int jfs_symlink(struct user_namespace *mnt_userns, struct inode *dip, +static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip, struct dentry *dentry, const char *name) { int rc; @@ -1059,7 +1059,7 @@ static int jfs_symlink(struct user_namespace *mnt_userns, struct inode *dip, * * FUNCTION: rename a file or directory */ -static int jfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -1345,7 +1345,7 @@ static int jfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, * * FUNCTION: Create a special file (device) */ -static int jfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int jfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct jfs_inode_info *jfs_ip; diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index f9273f6901c8..f817798fa1eb 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -932,7 +932,7 @@ static int jfs_xattr_get(const struct xattr_handler *handler, } static int jfs_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -951,7 +951,7 @@ static int jfs_xattr_get_os2(const struct xattr_handler *handler, } static int jfs_xattr_set_os2(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 935ef8cb02b2..ef00b5fe8cee 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -149,9 +149,6 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, if (kn_from == kn_to) return strlcpy(buf, "/", buflen); - if (!buf) - return -EINVAL; - common = kernfs_common_ancestor(kn_from, kn_to); if (WARN_ON(!common)) return -EINVAL; @@ -1200,7 +1197,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, return d_splice_alias(inode, dentry); } -static int kernfs_iop_mkdir(struct user_namespace *mnt_userns, +static int kernfs_iop_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { @@ -1238,7 +1235,7 @@ static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) return ret; } -static int kernfs_iop_rename(struct user_namespace *mnt_userns, +static int kernfs_iop_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) diff 
--git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index eac0f210299a..30494dcb0df3 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -107,7 +107,7 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) return ret; } -int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); @@ -120,7 +120,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, root = kernfs_root(kn); down_write(&root->kernfs_rwsem); - error = setattr_prepare(&init_user_ns, dentry, iattr); + error = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (error) goto out; @@ -129,7 +129,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, goto out; /* this ignores size changes */ - setattr_copy(&init_user_ns, inode, iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); out: up_write(&root->kernfs_rwsem); @@ -181,7 +181,7 @@ static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) set_nlink(inode, kn->dir.subdirs + 2); } -int kernfs_iop_getattr(struct user_namespace *mnt_userns, +int kernfs_iop_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -191,7 +191,7 @@ int kernfs_iop_getattr(struct user_namespace *mnt_userns, down_read(&root->kernfs_rwsem); kernfs_refresh_inode(kn, inode); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); up_read(&root->kernfs_rwsem); return 0; @@ -272,7 +272,7 @@ void kernfs_evict_inode(struct inode *inode) kernfs_put(kn); } -int kernfs_iop_permission(struct user_namespace *mnt_userns, +int kernfs_iop_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct kernfs_node *kn; @@ -287,7 +287,7 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns, down_read(&root->kernfs_rwsem); kernfs_refresh_inode(kn, inode); - ret = generic_permission(&init_user_ns, inode, mask); + ret = generic_permission(&nop_mnt_idmap, inode, mask); up_read(&root->kernfs_rwsem); return ret; @@ -324,7 +324,7 @@ static int kernfs_vfs_xattr_get(const struct xattr_handler *handler, } static int kernfs_vfs_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *suffix, const void *value, size_t size, int flags) @@ -391,7 +391,7 @@ static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn, } static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *suffix, const void *value, size_t size, int flags) diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 9046d9f39e63..236c3a6113f1 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -127,11 +127,11 @@ extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; */ extern const struct xattr_handler *kernfs_xattr_handlers[]; void kernfs_evict_inode(struct inode *inode); -int kernfs_iop_permission(struct user_namespace *mnt_userns, +int kernfs_iop_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); -int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr); -int 
kernfs_iop_getattr(struct user_namespace *mnt_userns, +int kernfs_iop_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); diff --git a/fs/ksmbd/Kconfig b/fs/ksmbd/Kconfig index e1fe17747ed6..7055cb5d2880 100644 --- a/fs/ksmbd/Kconfig +++ b/fs/ksmbd/Kconfig @@ -33,14 +33,16 @@ config SMB_SERVER in ksmbd-tools, available from https://github.com/cifsd-team/ksmbd-tools. More detail about how to run the ksmbd kernel server is - available via README file + available via the README file (https://github.com/cifsd-team/ksmbd-tools/blob/master/README). ksmbd kernel server includes support for auto-negotiation, Secure negotiate, Pre-authentication integrity, oplock/lease, compound requests, multi-credit, packet signing, RDMA(smbdirect), smb3 encryption, copy-offload, secure per-user session - establishment via NTLM or NTLMv2. + establishment via Kerberos or NTLMv2. + +if SMB_SERVER config SMB_SERVER_SMBDIRECT bool "Support for SMB Direct protocol" @@ -54,6 +56,8 @@ config SMB_SERVER_SMBDIRECT SMB Direct allows transferring SMB packets over RDMA. If unsure, say N. +endif + config SMB_SERVER_CHECK_CAP_NET_ADMIN bool "Enable check network administration capability" depends on SMB_SERVER diff --git a/fs/ksmbd/asn1.c b/fs/ksmbd/asn1.c index c03eba090368..cc6384f79675 100644 --- a/fs/ksmbd/asn1.c +++ b/fs/ksmbd/asn1.c @@ -208,9 +208,9 @@ int ksmbd_neg_token_init_mech_type(void *context, size_t hdrlen, return 0; } -int ksmbd_neg_token_init_mech_token(void *context, size_t hdrlen, - unsigned char tag, const void *value, - size_t vlen) +static int ksmbd_neg_token_alloc(void *context, size_t hdrlen, + unsigned char tag, const void *value, + size_t vlen) { struct ksmbd_conn *conn = context; @@ -223,17 +223,16 @@ int ksmbd_neg_token_init_mech_token(void *context, size_t hdrlen, return 0; } -int ksmbd_neg_token_targ_resp_token(void *context, size_t hdrlen, +int ksmbd_neg_token_init_mech_token(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { - struct ksmbd_conn *conn = context; - - conn->mechToken = kmalloc(vlen + 1, GFP_KERNEL); - if (!conn->mechToken) - return -ENOMEM; + return ksmbd_neg_token_alloc(context, hdrlen, tag, value, vlen); +} - memcpy(conn->mechToken, value, vlen); - conn->mechToken[vlen] = '\0'; - return 0; +int ksmbd_neg_token_targ_resp_token(void *context, size_t hdrlen, + unsigned char tag, const void *value, + size_t vlen) +{ + return ksmbd_neg_token_alloc(context, hdrlen, tag, value, vlen); } diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c index fd0a288af299..5b10b03800c1 100644 --- a/fs/ksmbd/connection.c +++ b/fs/ksmbd/connection.c @@ -114,7 +114,7 @@ void ksmbd_conn_enqueue_request(struct ksmbd_work *work) if (conn->ops->get_cmd_val(work) != SMB2_CANCEL_HE) { requests_queue = &conn->requests; - work->syncronous = true; + work->synchronous = true; } if (requests_queue) { @@ -139,7 +139,7 @@ int ksmbd_conn_try_dequeue_request(struct ksmbd_work *work) spin_lock(&conn->request_lock); if (!work->multiRsp) { list_del_init(&work->request_entry); - if (work->syncronous == false) + if (!work->synchronous) list_del_init(&work->async_request_entry); ret = 0; } @@ -280,7 +280,7 @@ int ksmbd_conn_handler_loop(void *p) { struct ksmbd_conn *conn = (struct ksmbd_conn *)p; struct ksmbd_transport *t = conn->transport; - unsigned int pdu_size; + unsigned int pdu_size, max_allowed_pdu_size; char hdr_buf[4] = {0,}; int 
size; @@ -305,13 +305,26 @@ int ksmbd_conn_handler_loop(void *p) pdu_size = get_rfc1002_len(hdr_buf); ksmbd_debug(CONN, "RFC1002 header %u bytes\n", pdu_size); + if (conn->status == KSMBD_SESS_GOOD) + max_allowed_pdu_size = + SMB3_MAX_MSGSIZE + conn->vals->max_write_size; + else + max_allowed_pdu_size = SMB3_MAX_MSGSIZE; + + if (pdu_size > max_allowed_pdu_size) { + pr_err_ratelimited("PDU length(%u) exceeded maximum allowed pdu size(%u) on connection(%d)\n", + pdu_size, max_allowed_pdu_size, + conn->status); + break; + } + /* * Check if pdu size is valid (min : smb header size, * max : 0x00FFFFFF). */ if (pdu_size < __SMB2_HEADER_STRUCTURE_SIZE || pdu_size > MAX_STREAM_PROT_LEN) { - continue; + break; } /* 4 for rfc1002 length field */ diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h index b6bd8311e6b4..fb8b2d566efb 100644 --- a/fs/ksmbd/ksmbd_netlink.h +++ b/fs/ksmbd/ksmbd_netlink.h @@ -106,7 +106,8 @@ struct ksmbd_startup_request { __u32 sub_auth[3]; /* Subauth value for Security ID */ __u32 smb2_max_credits; /* MAX credits */ __u32 smbd_max_io_size; /* smbd read write size */ - __u32 reserved[127]; /* Reserved room */ + __u32 max_connections; /* Number of maximum simultaneous connections */ + __u32 reserved[126]; /* Reserved room */ __u32 ifc_list_sz; /* interfaces list size */ __s8 ____payload[]; }; diff --git a/fs/ksmbd/ksmbd_work.h b/fs/ksmbd/ksmbd_work.h index 5ece58e40c97..3234f2cf6327 100644 --- a/fs/ksmbd/ksmbd_work.h +++ b/fs/ksmbd/ksmbd_work.h @@ -68,7 +68,7 @@ struct ksmbd_work { /* Request is encrypted */ bool encrypted:1; /* Is this SYNC or ASYNC ksmbd_work */ - bool syncronous:1; + bool synchronous:1; bool need_invalidate_rkey:1; unsigned int remote_key; diff --git a/fs/ksmbd/mgmt/user_session.c b/fs/ksmbd/mgmt/user_session.c index 92b1603b5abe..1ca2aae4c299 100644 --- a/fs/ksmbd/mgmt/user_session.c +++ b/fs/ksmbd/mgmt/user_session.c @@ -25,20 +25,19 @@ static DECLARE_RWSEM(sessions_table_lock); struct ksmbd_session_rpc { int id; unsigned int method; - struct list_head list; }; static void free_channel_list(struct ksmbd_session *sess) { - struct channel *chann, *tmp; + struct channel *chann; + unsigned long index; - write_lock(&sess->chann_lock); - list_for_each_entry_safe(chann, tmp, &sess->ksmbd_chann_list, - chann_list) { - list_del(&chann->chann_list); + xa_for_each(&sess->ksmbd_chann_list, index, chann) { + xa_erase(&sess->ksmbd_chann_list, index); kfree(chann); } - write_unlock(&sess->chann_lock); + + xa_destroy(&sess->ksmbd_chann_list); } static void __session_rpc_close(struct ksmbd_session *sess, @@ -58,15 +57,14 @@ static void __session_rpc_close(struct ksmbd_session *sess, static void ksmbd_session_rpc_clear_list(struct ksmbd_session *sess) { struct ksmbd_session_rpc *entry; + long index; - while (!list_empty(&sess->rpc_handle_list)) { - entry = list_entry(sess->rpc_handle_list.next, - struct ksmbd_session_rpc, - list); - - list_del(&entry->list); + xa_for_each(&sess->rpc_handle_list, index, entry) { + xa_erase(&sess->rpc_handle_list, index); __session_rpc_close(sess, entry); } + + xa_destroy(&sess->rpc_handle_list); } static int __rpc_method(char *rpc_name) @@ -102,13 +100,13 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name) entry = kzalloc(sizeof(struct ksmbd_session_rpc), GFP_KERNEL); if (!entry) - return -EINVAL; + return -ENOMEM; - list_add(&entry->list, &sess->rpc_handle_list); entry->method = method; entry->id = ksmbd_ipc_id_alloc(); if (entry->id < 0) goto free_entry; + xa_store(&sess->rpc_handle_list, entry->id, 
entry, GFP_KERNEL); resp = ksmbd_rpc_open(sess, entry->id); if (!resp) @@ -117,9 +115,9 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name) kvfree(resp); return entry->id; free_id: + xa_erase(&sess->rpc_handle_list, entry->id); ksmbd_rpc_id_free(entry->id); free_entry: - list_del(&entry->list); kfree(entry); return -EINVAL; } @@ -128,24 +126,17 @@ void ksmbd_session_rpc_close(struct ksmbd_session *sess, int id) { struct ksmbd_session_rpc *entry; - list_for_each_entry(entry, &sess->rpc_handle_list, list) { - if (entry->id == id) { - list_del(&entry->list); - __session_rpc_close(sess, entry); - break; - } - } + entry = xa_erase(&sess->rpc_handle_list, id); + if (entry) + __session_rpc_close(sess, entry); } int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id) { struct ksmbd_session_rpc *entry; - list_for_each_entry(entry, &sess->rpc_handle_list, list) { - if (entry->id == id) - return entry->method; - } - return 0; + entry = xa_load(&sess->rpc_handle_list, id); + return entry ? entry->method : 0; } void ksmbd_session_destroy(struct ksmbd_session *sess) @@ -190,21 +181,15 @@ int ksmbd_session_register(struct ksmbd_conn *conn, static int ksmbd_chann_del(struct ksmbd_conn *conn, struct ksmbd_session *sess) { - struct channel *chann, *tmp; - - write_lock(&sess->chann_lock); - list_for_each_entry_safe(chann, tmp, &sess->ksmbd_chann_list, - chann_list) { - if (chann->conn == conn) { - list_del(&chann->chann_list); - kfree(chann); - write_unlock(&sess->chann_lock); - return 0; - } - } - write_unlock(&sess->chann_lock); + struct channel *chann; + + chann = xa_erase(&sess->ksmbd_chann_list, (long)conn); + if (!chann) + return -ENOENT; - return -ENOENT; + kfree(chann); + + return 0; } void ksmbd_sessions_deregister(struct ksmbd_conn *conn) @@ -234,7 +219,7 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn) return; sess_destroy: - if (list_empty(&sess->ksmbd_chann_list)) { + if (xa_empty(&sess->ksmbd_chann_list)) { xa_erase(&conn->sessions, sess->id); ksmbd_session_destroy(sess); } @@ -320,6 +305,9 @@ static struct ksmbd_session *__session_create(int protocol) struct ksmbd_session *sess; int ret; + if (protocol != CIFDS_SESSION_FLAG_SMB2) + return NULL; + sess = kzalloc(sizeof(struct ksmbd_session), GFP_KERNEL); if (!sess) return NULL; @@ -329,30 +317,20 @@ static struct ksmbd_session *__session_create(int protocol) set_session_flag(sess, protocol); xa_init(&sess->tree_conns); - INIT_LIST_HEAD(&sess->ksmbd_chann_list); - INIT_LIST_HEAD(&sess->rpc_handle_list); + xa_init(&sess->ksmbd_chann_list); + xa_init(&sess->rpc_handle_list); sess->sequence_number = 1; - rwlock_init(&sess->chann_lock); - - switch (protocol) { - case CIFDS_SESSION_FLAG_SMB2: - ret = __init_smb2_session(sess); - break; - default: - ret = -EINVAL; - break; - } + ret = __init_smb2_session(sess); if (ret) goto error; ida_init(&sess->tree_conn_ida); - if (protocol == CIFDS_SESSION_FLAG_SMB2) { - down_write(&sessions_table_lock); - hash_add(sessions_table, &sess->hlist, sess->id); - up_write(&sessions_table_lock); - } + down_write(&sessions_table_lock); + hash_add(sessions_table, &sess->hlist, sess->id); + up_write(&sessions_table_lock); + return sess; error: diff --git a/fs/ksmbd/mgmt/user_session.h b/fs/ksmbd/mgmt/user_session.h index 8934b8ee275b..b6a9e7a6aae4 100644 --- a/fs/ksmbd/mgmt/user_session.h +++ b/fs/ksmbd/mgmt/user_session.h @@ -21,7 +21,6 @@ struct ksmbd_file_table; struct channel { __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; struct ksmbd_conn *conn; - struct list_head chann_list; }; 
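
/*
 * A minimal sketch of the XArray pattern that replaces the channel and RPC
 * handle lists in these hunks: entries are keyed by an integer, or by a
 * pointer cast to one. The helper names are hypothetical; xa_store(),
 * xa_load() and xa_erase() are the stock XArray API.
 */
static int chann_insert(struct ksmbd_session *sess, struct ksmbd_conn *conn,
			struct channel *chann)
{
	/* xa_store() returns the old entry or an XA_ERROR-encoded pointer */
	return xa_err(xa_store(&sess->ksmbd_chann_list, (unsigned long)conn,
			       chann, GFP_KERNEL));
}

static struct channel *chann_find(struct ksmbd_session *sess,
				  struct ksmbd_conn *conn)
{
	return xa_load(&sess->ksmbd_chann_list, (unsigned long)conn);
}
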
struct preauth_session { @@ -50,11 +49,10 @@ struct ksmbd_session { char sess_key[CIFS_KEY_SIZE]; struct hlist_node hlist; - rwlock_t chann_lock; - struct list_head ksmbd_chann_list; + struct xarray ksmbd_chann_list; struct xarray tree_conns; struct ida tree_conn_ida; - struct list_head rpc_handle_list; + struct xarray rpc_handle_list; __u8 smb3encryptionkey[SMB3_ENC_DEC_KEY_SIZE]; __u8 smb3decryptionkey[SMB3_ENC_DEC_KEY_SIZE]; diff --git a/fs/ksmbd/ndr.c b/fs/ksmbd/ndr.c index 0ae8d08d85a8..3507d8f89074 100644 --- a/fs/ksmbd/ndr.c +++ b/fs/ksmbd/ndr.c @@ -242,7 +242,7 @@ int ndr_decode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da) return ret; if (da->version != 3 && da->version != 4) { - pr_err("v%d version is not supported\n", da->version); + ksmbd_debug(VFS, "v%d version is not supported\n", da->version); return -EINVAL; } @@ -251,7 +251,7 @@ int ndr_decode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da) return ret; if (da->version != version2) { - pr_err("ndr version mismatched(version: %d, version2: %d)\n", + ksmbd_debug(VFS, "ndr version mismatched(version: %d, version2: %d)\n", da->version, version2); return -EINVAL; } @@ -338,7 +338,7 @@ static int ndr_encode_posix_acl_entry(struct ndr *n, struct xattr_smb_acl *acl) } int ndr_encode_posix_acl(struct ndr *n, - struct user_namespace *user_ns, + struct mnt_idmap *idmap, struct inode *inode, struct xattr_smb_acl *acl, struct xattr_smb_acl *def_acl) @@ -374,11 +374,11 @@ int ndr_encode_posix_acl(struct ndr *n, if (ret) return ret; - vfsuid = i_uid_into_vfsuid(user_ns, inode); + vfsuid = i_uid_into_vfsuid(idmap, inode); ret = ndr_write_int64(n, from_kuid(&init_user_ns, vfsuid_into_kuid(vfsuid))); if (ret) return ret; - vfsgid = i_gid_into_vfsgid(user_ns, inode); + vfsgid = i_gid_into_vfsgid(idmap, inode); ret = ndr_write_int64(n, from_kgid(&init_user_ns, vfsgid_into_kgid(vfsgid))); if (ret) return ret; @@ -457,7 +457,7 @@ int ndr_decode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl) if (ret) return ret; if (acl->version != 4) { - pr_err("v%d version is not supported\n", acl->version); + ksmbd_debug(VFS, "v%d version is not supported\n", acl->version); return -EINVAL; } @@ -465,7 +465,7 @@ int ndr_decode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl) if (ret) return ret; if (acl->version != version2) { - pr_err("ndr version mismatched(version: %d, version2: %d)\n", + ksmbd_debug(VFS, "ndr version mismatched(version: %d, version2: %d)\n", acl->version, version2); return -EINVAL; } diff --git a/fs/ksmbd/ndr.h b/fs/ksmbd/ndr.h index 60ca265d1bb0..f3c108c8cf4d 100644 --- a/fs/ksmbd/ndr.h +++ b/fs/ksmbd/ndr.h @@ -14,7 +14,7 @@ struct ndr { int ndr_encode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da); int ndr_decode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da); -int ndr_encode_posix_acl(struct ndr *n, struct user_namespace *user_ns, +int ndr_encode_posix_acl(struct ndr *n, struct mnt_idmap *idmap, struct inode *inode, struct xattr_smb_acl *acl, struct xattr_smb_acl *def_acl); int ndr_encode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl); diff --git a/fs/ksmbd/oplock.c b/fs/ksmbd/oplock.c index d7d47b82451d..2e54ded4d92c 100644 --- a/fs/ksmbd/oplock.c +++ b/fs/ksmbd/oplock.c @@ -1608,9 +1608,9 @@ void create_posix_rsp_buf(char *cc, struct ksmbd_file *fp) { struct create_posix_rsp *buf; struct inode *inode = file_inode(fp->filp); - struct user_namespace *user_ns = file_mnt_user_ns(fp->filp); - vfsuid_t vfsuid = i_uid_into_vfsuid(user_ns, inode); - vfsgid_t vfsgid = i_gid_into_vfsgid(user_ns, inode); + struct 
mnt_idmap *idmap = file_mnt_idmap(fp->filp); + vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); buf = (struct create_posix_rsp *)cc; memset(buf, 0, sizeof(struct create_posix_rsp)); diff --git a/fs/ksmbd/server.h b/fs/ksmbd/server.h index ac9d932f8c8a..db7278181760 100644 --- a/fs/ksmbd/server.h +++ b/fs/ksmbd/server.h @@ -41,6 +41,7 @@ struct ksmbd_server_config { unsigned int share_fake_fscaps; struct smb_sid domain_sid; unsigned int auth_mechs; + unsigned int max_connections; char *conf[SERVER_CONF_WORK_GROUP + 1]; }; diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c index 6e25ace36568..fbdde426dd01 100644 --- a/fs/ksmbd/smb2misc.c +++ b/fs/ksmbd/smb2misc.c @@ -149,15 +149,11 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len, break; case SMB2_LOCK: { - int lock_count; + unsigned short lock_count; - /* - * smb2_lock request size is 48 included single - * smb2_lock_element structure size. - */ - lock_count = le16_to_cpu(((struct smb2_lock_req *)hdr)->LockCount) - 1; + lock_count = le16_to_cpu(((struct smb2_lock_req *)hdr)->LockCount); if (lock_count > 0) { - *off = __SMB2_HEADER_STRUCTURE_SIZE + 48; + *off = offsetof(struct smb2_lock_req, locks); *len = sizeof(struct smb2_lock_element) * lock_count; } break; @@ -412,20 +408,19 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) goto validate_credit; /* - * windows client also pad up to 8 bytes when compounding. - * If pad is longer than eight bytes, log the server behavior - * (once), since may indicate a problem but allow it and - * continue since the frame is parseable. + * SMB2 NEGOTIATE request will be validated when message + * handling proceeds. */ - if (clc_len < len) { - ksmbd_debug(SMB, - "cli req padded more than expected. Length %d not %d for cmd:%d mid:%llu\n", - len, clc_len, command, - le64_to_cpu(hdr->MessageId)); + if (command == SMB2_NEGOTIATE_HE) + goto validate_credit; + + /* + * Allow a message that padded to 8byte boundary. + */ + if (clc_len < len && (len - clc_len) < 8) goto validate_credit; - } - ksmbd_debug(SMB, + pr_err_ratelimited( "cli req too short, len %d not %d. 
cmd:%d mid:%llu\n", len, clc_len, command, le64_to_cpu(hdr->MessageId)); diff --git a/fs/ksmbd/smb2ops.c b/fs/ksmbd/smb2ops.c index e401302478c3..aed7704a0672 100644 --- a/fs/ksmbd/smb2ops.c +++ b/fs/ksmbd/smb2ops.c @@ -26,7 +26,7 @@ static struct smb_version_values smb21_server_values = { .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, @@ -52,7 +52,7 @@ static struct smb_version_values smb30_server_values = { .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, @@ -79,7 +79,7 @@ static struct smb_version_values smb302_server_values = { .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, @@ -106,7 +106,7 @@ static struct smb_version_values smb311_server_values = { .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 38fbda52e06f..0685c1c77b9f 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -12,6 +12,7 @@ #include <linux/ethtool.h> #include <linux/falloc.h> #include <linux/mount.h> +#include <linux/filelock.h> #include "glob.h" #include "smbfsctl.h" @@ -74,14 +75,7 @@ static inline bool check_session_id(struct ksmbd_conn *conn, u64 id) struct channel *lookup_chann_list(struct ksmbd_session *sess, struct ksmbd_conn *conn) { - struct channel *chann; - - list_for_each_entry(chann, &sess->ksmbd_chann_list, chann_list) { - if (chann->conn == conn) - return chann; - } - - return NULL; + return xa_load(&sess->ksmbd_chann_list, (long)conn); } /** @@ -280,8 +274,7 @@ int init_smb2_neg_rsp(struct ksmbd_work *work) le16_to_cpu(rsp->SecurityBufferOffset)); inc_rfc1001_len(work->response_buf, sizeof(struct smb2_negotiate_rsp) - - sizeof(struct smb2_hdr) - sizeof(rsp->Buffer) + - AUTH_GSS_LENGTH); + sizeof(struct smb2_hdr) + AUTH_GSS_LENGTH); rsp->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED_LE; if (server_conf.signing == KSMBD_CONFIG_OPT_MANDATORY) rsp->SecurityMode |= SMB2_NEGOTIATE_SIGNING_REQUIRED_LE; @@ -505,7 +498,7 @@ int init_smb2_rsp_hdr(struct ksmbd_work *work) rsp_hdr->SessionId = rcv_hdr->SessionId; memcpy(rsp_hdr->Signature, rcv_hdr->Signature, 16); - work->syncronous = true; + work->synchronous = true; if (work->async_id) { ksmbd_release_id(&conn->async_ida, work->async_id); work->async_id = 0; @@ -595,6 +588,7 @@ static void destroy_previous_session(struct ksmbd_conn *conn, struct ksmbd_session *prev_sess = ksmbd_session_lookup_slowpath(id); struct ksmbd_user *prev_user; struct channel *chann; + long index; if (!prev_sess) return; @@ -608,10 +602,8 @@ static void destroy_previous_session(struct ksmbd_conn *conn, return; prev_sess->state = SMB2_SESSION_EXPIRED; - 
write_lock(&prev_sess->chann_lock); - list_for_each_entry(chann, &prev_sess->ksmbd_chann_list, chann_list) + xa_for_each(&prev_sess->ksmbd_chann_list, index, chann) chann->conn->status = KSMBD_SESS_EXITING; - write_unlock(&prev_sess->chann_lock); } /** @@ -652,7 +644,7 @@ int setup_async_work(struct ksmbd_work *work, void (*fn)(void **), void **arg) pr_err("Failed to alloc async message id\n"); return id; } - work->syncronous = false; + work->synchronous = false; work->async_id = id; rsp_hdr->Id.AsyncId = cpu_to_le64(id); @@ -1212,8 +1204,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) ksmbd_copy_gss_neg_header((char *)(&rsp->hdr) + le16_to_cpu(rsp->SecurityBufferOffset)); inc_rfc1001_len(work->response_buf, sizeof(struct smb2_negotiate_rsp) - - sizeof(struct smb2_hdr) - sizeof(rsp->Buffer) + - AUTH_GSS_LENGTH); + sizeof(struct smb2_hdr) + AUTH_GSS_LENGTH); rsp->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED_LE; conn->use_spnego = true; @@ -1519,19 +1510,14 @@ static int ntlm_authenticate(struct ksmbd_work *work) binding_session: if (conn->dialect >= SMB30_PROT_ID) { - read_lock(&sess->chann_lock); chann = lookup_chann_list(sess, conn); - read_unlock(&sess->chann_lock); if (!chann) { chann = kmalloc(sizeof(struct channel), GFP_KERNEL); if (!chann) return -ENOMEM; chann->conn = conn; - INIT_LIST_HEAD(&chann->chann_list); - write_lock(&sess->chann_lock); - list_add(&chann->chann_list, &sess->ksmbd_chann_list); - write_unlock(&sess->chann_lock); + xa_store(&sess->ksmbd_chann_list, (long)conn, chann, GFP_KERNEL); } } @@ -1606,19 +1592,14 @@ static int krb5_authenticate(struct ksmbd_work *work) } if (conn->dialect >= SMB30_PROT_ID) { - read_lock(&sess->chann_lock); chann = lookup_chann_list(sess, conn); - read_unlock(&sess->chann_lock); if (!chann) { chann = kmalloc(sizeof(struct channel), GFP_KERNEL); if (!chann) return -ENOMEM; chann->conn = conn; - INIT_LIST_HEAD(&chann->chann_list); - write_lock(&sess->chann_lock); - list_add(&chann->chann_list, &sess->ksmbd_chann_list); - write_unlock(&sess->chann_lock); + xa_store(&sess->ksmbd_chann_list, (long)conn, chann, GFP_KERNEL); } } @@ -2192,7 +2173,7 @@ out: static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, const struct path *path) { - struct user_namespace *user_ns = mnt_user_ns(path->mnt); + struct mnt_idmap *idmap = mnt_idmap(path->mnt); char *attr_name = NULL, *value; int rc = 0; unsigned int next = 0; @@ -2228,7 +2209,7 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, value = (char *)&eabuf->name + eabuf->EaNameLength + 1; if (!eabuf->EaValueLength) { - rc = ksmbd_vfs_casexattr_len(user_ns, + rc = ksmbd_vfs_casexattr_len(idmap, path->dentry, attr_name, XATTR_USER_PREFIX_LEN + @@ -2236,7 +2217,7 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, /* delete the EA only when it exits */ if (rc > 0) { - rc = ksmbd_vfs_remove_xattr(user_ns, + rc = ksmbd_vfs_remove_xattr(idmap, path->dentry, attr_name); @@ -2251,7 +2232,7 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, /* if the EA doesn't exist, just do nothing. 
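
/*
 * A minimal sketch of the delete-or-set EA idiom used in smb2_set_ea()
 * above, expressed with the generic VFS xattr helpers instead of the
 * ksmbd_vfs_* wrappers; myfs_ea_update() is hypothetical, and the
 * vfs_setxattr()/vfs_removexattr() prototypes assumed here are the
 * post-conversion ones that take a struct mnt_idmap *.
 */
static int myfs_ea_update(struct mnt_idmap *idmap, struct dentry *dentry,
			  const char *name, const void *value, size_t len)
{
	/* a zero-length value requests removal of the EA */
	if (!len)
		return vfs_removexattr(idmap, dentry, name);
	return vfs_setxattr(idmap, dentry, name, value, len, 0);
}
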
*/ rc = 0; } else { - rc = ksmbd_vfs_setxattr(user_ns, + rc = ksmbd_vfs_setxattr(idmap, path->dentry, attr_name, value, le16_to_cpu(eabuf->EaValueLength), 0); if (rc < 0) { @@ -2281,7 +2262,7 @@ static noinline int smb2_set_stream_name_xattr(const struct path *path, struct ksmbd_file *fp, char *stream_name, int s_type) { - struct user_namespace *user_ns = mnt_user_ns(path->mnt); + struct mnt_idmap *idmap = mnt_idmap(path->mnt); size_t xattr_stream_size; char *xattr_stream_name; int rc; @@ -2297,7 +2278,7 @@ static noinline int smb2_set_stream_name_xattr(const struct path *path, fp->stream.size = xattr_stream_size; /* Check if there is stream prefix in xattr space */ - rc = ksmbd_vfs_casexattr_len(user_ns, + rc = ksmbd_vfs_casexattr_len(idmap, path->dentry, xattr_stream_name, xattr_stream_size); @@ -2309,7 +2290,7 @@ static noinline int smb2_set_stream_name_xattr(const struct path *path, return -EBADF; } - rc = ksmbd_vfs_setxattr(user_ns, path->dentry, + rc = ksmbd_vfs_setxattr(idmap, path->dentry, xattr_stream_name, NULL, 0, 0); if (rc < 0) pr_err("Failed to store XATTR stream name :%d\n", rc); @@ -2318,7 +2299,7 @@ static noinline int smb2_set_stream_name_xattr(const struct path *path, static int smb2_remove_smb_xattrs(const struct path *path) { - struct user_namespace *user_ns = mnt_user_ns(path->mnt); + struct mnt_idmap *idmap = mnt_idmap(path->mnt); char *name, *xattr_list = NULL; ssize_t xattr_list_len; int err = 0; @@ -2338,7 +2319,7 @@ static int smb2_remove_smb_xattrs(const struct path *path) if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && !strncmp(&name[XATTR_USER_PREFIX_LEN], STREAM_PREFIX, STREAM_PREFIX_LEN)) { - err = ksmbd_vfs_remove_xattr(user_ns, path->dentry, + err = ksmbd_vfs_remove_xattr(idmap, path->dentry, name); if (err) ksmbd_debug(SMB, "remove xattr failed : %s\n", @@ -2385,7 +2366,7 @@ static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, const struct path * da.flags = XATTR_DOSINFO_ATTRIB | XATTR_DOSINFO_CREATE_TIME | XATTR_DOSINFO_ITIME; - rc = ksmbd_vfs_set_dos_attrib_xattr(mnt_user_ns(path->mnt), + rc = ksmbd_vfs_set_dos_attrib_xattr(mnt_idmap(path->mnt), path->dentry, &da); if (rc) ksmbd_debug(SMB, "failed to store file attribute into xattr\n"); @@ -2404,7 +2385,7 @@ static void smb2_update_xattrs(struct ksmbd_tree_connect *tcon, KSMBD_SHARE_FLAG_STORE_DOS_ATTRS)) return; - rc = ksmbd_vfs_get_dos_attrib_xattr(mnt_user_ns(path->mnt), + rc = ksmbd_vfs_get_dos_attrib_xattr(mnt_idmap(path->mnt), path->dentry, &da); if (rc > 0) { fp->f_ci->m_fattr = cpu_to_le32(da.attr); @@ -2479,11 +2460,11 @@ static int smb2_create_sd_buffer(struct ksmbd_work *work, } static void ksmbd_acls_fattr(struct smb_fattr *fattr, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct inode *inode) { - vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); - vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); fattr->cf_uid = vfsuid_into_kuid(vfsuid); fattr->cf_gid = vfsgid_into_kgid(vfsgid); @@ -2515,7 +2496,7 @@ int smb2_open(struct ksmbd_work *work) struct ksmbd_share_config *share = tcon->share_conf; struct ksmbd_file *fp = NULL; struct file *filp = NULL; - struct user_namespace *user_ns = NULL; + struct mnt_idmap *idmap = NULL; struct kstat stat; struct create_context *context; struct lease_ctx_info *lc = NULL; @@ -2768,7 +2749,7 @@ int smb2_open(struct ksmbd_work *work) rc = 0; } else { file_present = true; - user_ns = 
mnt_user_ns(path.mnt); + idmap = mnt_idmap(path.mnt); } if (stream_name) { if (req->CreateOptions & FILE_DIRECTORY_FILE_LE) { @@ -2831,7 +2812,7 @@ int smb2_open(struct ksmbd_work *work) if (!file_present) { daccess = cpu_to_le32(GENERIC_ALL_FLAGS); } else { - rc = ksmbd_vfs_query_maximal_access(user_ns, + rc = ksmbd_vfs_query_maximal_access(idmap, path.dentry, &daccess); if (rc) @@ -2867,7 +2848,7 @@ int smb2_open(struct ksmbd_work *work) } created = true; - user_ns = mnt_user_ns(path.mnt); + idmap = mnt_idmap(path.mnt); if (ea_buf) { if (le32_to_cpu(ea_buf->ccontext.DataLength) < sizeof(struct smb2_ea_info)) { @@ -2889,7 +2870,7 @@ int smb2_open(struct ksmbd_work *work) * is already granted. */ if (daccess & ~(FILE_READ_ATTRIBUTES_LE | FILE_READ_CONTROL_LE)) { - rc = inode_permission(user_ns, + rc = inode_permission(idmap, d_inode(path.dentry), may_flags); if (rc) @@ -2897,7 +2878,7 @@ int smb2_open(struct ksmbd_work *work) if ((daccess & FILE_DELETE_LE) || (req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)) { - rc = ksmbd_vfs_may_delete(user_ns, + rc = ksmbd_vfs_may_delete(idmap, path.dentry); if (rc) goto err_out; @@ -2960,7 +2941,7 @@ int smb2_open(struct ksmbd_work *work) int posix_acl_rc; struct inode *inode = d_inode(path.dentry); - posix_acl_rc = ksmbd_vfs_inherit_posix_acl(user_ns, + posix_acl_rc = ksmbd_vfs_inherit_posix_acl(idmap, path.dentry, d_inode(path.dentry->d_parent)); if (posix_acl_rc) @@ -2976,7 +2957,7 @@ int smb2_open(struct ksmbd_work *work) rc = smb2_create_sd_buffer(work, req, &path); if (rc) { if (posix_acl_rc) - ksmbd_vfs_set_init_posix_acl(user_ns, + ksmbd_vfs_set_init_posix_acl(idmap, path.dentry); if (test_share_config_flag(work->tcon->share_conf, @@ -2985,7 +2966,7 @@ int smb2_open(struct ksmbd_work *work) struct smb_ntsd *pntsd; int pntsd_size, ace_num = 0; - ksmbd_acls_fattr(&fattr, user_ns, inode); + ksmbd_acls_fattr(&fattr, idmap, inode); if (fattr.cf_acls) ace_num = fattr.cf_acls->a_count; if (fattr.cf_dacls) @@ -2999,7 +2980,7 @@ int smb2_open(struct ksmbd_work *work) if (!pntsd) goto err_out; - rc = build_sec_desc(user_ns, + rc = build_sec_desc(idmap, pntsd, NULL, 0, OWNER_SECINFO | GROUP_SECINFO | @@ -3013,7 +2994,7 @@ int smb2_open(struct ksmbd_work *work) } rc = ksmbd_vfs_set_sd_xattr(conn, - user_ns, + idmap, path.dentry, pntsd, pntsd_size); @@ -3209,7 +3190,7 @@ int smb2_open(struct ksmbd_work *work) struct create_context *mxac_ccontext; if (maximal_access == 0) - ksmbd_vfs_query_maximal_access(user_ns, + ksmbd_vfs_query_maximal_access(idmap, path.dentry, &maximal_access); mxac_ccontext = (struct create_context *)(rsp->Buffer + @@ -3634,7 +3615,7 @@ static void unlock_dir(struct ksmbd_file *dir_fp) static int process_query_dir_entries(struct smb2_query_dir_private *priv) { - struct user_namespace *user_ns = file_mnt_user_ns(priv->dir_fp->filp); + struct mnt_idmap *idmap = file_mnt_idmap(priv->dir_fp->filp); struct kstat kstat; struct ksmbd_kstat ksmbd_kstat; int rc; @@ -3647,7 +3628,7 @@ static int process_query_dir_entries(struct smb2_query_dir_private *priv) return -EINVAL; lock_dir(priv->dir_fp); - dent = lookup_one(user_ns, priv->d_info->name, + dent = lookup_one(idmap, priv->d_info->name, priv->dir_fp->filp->f_path.dentry, priv->d_info->name_len); unlock_dir(priv->dir_fp); @@ -3668,7 +3649,7 @@ static int process_query_dir_entries(struct smb2_query_dir_private *priv) ksmbd_kstat.kstat = &kstat; if (priv->info_level != FILE_NAMES_INFORMATION) ksmbd_vfs_fill_dentry_attrs(priv->work, - user_ns, + idmap, dent, &ksmbd_kstat); @@ -3898,7 +3879,7 @@ 
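
Most of the remaining churn in smb2pdu.c is the tree-wide VFS transition from passing struct user_namespace *mnt_userns to the dedicated struct mnt_idmap *idmap: the value still comes from the mount (mnt_idmap(path->mnt), file_mnt_idmap(filp) instead of the *_user_ns variants) and is threaded through helpers such as inode_permission(), notify_change() and the ksmbd_vfs_* wrappers. A schematic before/after with ksmbd context elided; the two variants belong to different kernel versions and cannot coexist:

/* Before: mount idmapping carried as a user namespace. */
static int may_write_old(const struct path *path)
{
	struct user_namespace *mnt_userns = mnt_user_ns(path->mnt);

	return inode_permission(mnt_userns, d_inode(path->dentry), MAY_WRITE);
}

/* After: the same information behind a dedicated type, so a mount
 * idmapping can no longer be confused with a caller's or a
 * filesystem's user namespace at the type level.
 */
static int may_write_new(const struct path *path)
{
	struct mnt_idmap *idmap = mnt_idmap(path->mnt);

	return inode_permission(idmap, d_inode(path->dentry), MAY_WRITE);
}
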
int smb2_query_dir(struct ksmbd_work *work) } if (!(dir_fp->daccess & FILE_LIST_DIRECTORY_LE) || - inode_permission(file_mnt_user_ns(dir_fp->filp), + inode_permission(file_mnt_idmap(dir_fp->filp), file_inode(dir_fp->filp), MAY_READ | MAY_EXEC)) { pr_err("no right to enumerate directory (%pD)\n", dir_fp->filp); @@ -4164,7 +4145,7 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp, ssize_t buf_free_len, alignment_bytes, next_offset, rsp_data_cnt = 0; struct smb2_ea_info_req *ea_req = NULL; const struct path *path; - struct user_namespace *user_ns = file_mnt_user_ns(fp->filp); + struct mnt_idmap *idmap = file_mnt_idmap(fp->filp); if (!(fp->daccess & FILE_READ_EA_LE)) { pr_err("Not permitted to read ext attr : 0x%x\n", @@ -4244,7 +4225,7 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp, buf_free_len -= (offsetof(struct smb2_ea_info, name) + name_len + 1); /* bailout if xattr can't fit in buf_free_len */ - value_len = ksmbd_vfs_getxattr(user_ns, path->dentry, + value_len = ksmbd_vfs_getxattr(idmap, path->dentry, name, &buf); if (value_len <= 0) { rc = -ENOENT; @@ -4334,7 +4315,7 @@ static int get_file_basic_info(struct smb2_query_info_rsp *rsp, } basic_info = (struct smb2_file_basic_info *)rsp->Buffer; - generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp), + generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp), &stat); basic_info->CreationTime = cpu_to_le64(fp->create_time); time = ksmbd_UnixTimeToNT(stat.atime); @@ -4375,7 +4356,7 @@ static void get_file_standard_info(struct smb2_query_info_rsp *rsp, struct kstat stat; inode = file_inode(fp->filp); - generic_fillattr(file_mnt_user_ns(fp->filp), inode, &stat); + generic_fillattr(file_mnt_idmap(fp->filp), inode, &stat); sinfo = (struct smb2_file_standard_info *)rsp->Buffer; delete_pending = ksmbd_inode_pending_delete(fp); @@ -4429,7 +4410,7 @@ static int get_file_all_info(struct ksmbd_work *work, return PTR_ERR(filename); inode = file_inode(fp->filp); - generic_fillattr(file_mnt_user_ns(fp->filp), inode, &stat); + generic_fillattr(file_mnt_idmap(fp->filp), inode, &stat); ksmbd_debug(SMB, "filename = %s\n", filename); delete_pending = ksmbd_inode_pending_delete(fp); @@ -4506,7 +4487,7 @@ static void get_file_stream_info(struct ksmbd_work *work, int buf_free_len; struct smb2_query_info_req *req = ksmbd_req_buf_next(work); - generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp), + generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp), &stat); file_info = (struct smb2_file_stream_info *)rsp->Buffer; @@ -4597,7 +4578,7 @@ static void get_file_internal_info(struct smb2_query_info_rsp *rsp, struct smb2_file_internal_info *file_info; struct kstat stat; - generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp), + generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp), &stat); file_info = (struct smb2_file_internal_info *)rsp->Buffer; file_info->IndexNumber = cpu_to_le64(stat.ino); @@ -4623,7 +4604,7 @@ static int get_file_network_open_info(struct smb2_query_info_rsp *rsp, file_info = (struct smb2_file_ntwrk_info *)rsp->Buffer; inode = file_inode(fp->filp); - generic_fillattr(file_mnt_user_ns(fp->filp), inode, &stat); + generic_fillattr(file_mnt_idmap(fp->filp), inode, &stat); file_info->CreationTime = cpu_to_le64(fp->create_time); time = ksmbd_UnixTimeToNT(stat.atime); @@ -4684,7 +4665,7 @@ static void get_file_compression_info(struct smb2_query_info_rsp *rsp, struct smb2_file_comp_info *file_info; struct kstat stat; - 
generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp), + generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp), &stat); file_info = (struct smb2_file_comp_info *)rsp->Buffer; @@ -4725,9 +4706,9 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp, { struct smb311_posix_qinfo *file_info; struct inode *inode = file_inode(fp->filp); - struct user_namespace *user_ns = file_mnt_user_ns(fp->filp); - vfsuid_t vfsuid = i_uid_into_vfsuid(user_ns, inode); - vfsgid_t vfsgid = i_gid_into_vfsgid(user_ns, inode); + struct mnt_idmap *idmap = file_mnt_idmap(fp->filp); + vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); u64 time; int out_buf_len = sizeof(struct smb311_posix_qinfo) + 32; @@ -5127,7 +5108,7 @@ static int smb2_get_info_sec(struct ksmbd_work *work, struct smb2_query_info_rsp *rsp) { struct ksmbd_file *fp; - struct user_namespace *user_ns; + struct mnt_idmap *idmap; struct smb_ntsd *pntsd = (struct smb_ntsd *)rsp->Buffer, *ppntsd = NULL; struct smb_fattr fattr = {{0}}; struct inode *inode; @@ -5174,19 +5155,19 @@ static int smb2_get_info_sec(struct ksmbd_work *work, if (!fp) return -ENOENT; - user_ns = file_mnt_user_ns(fp->filp); + idmap = file_mnt_idmap(fp->filp); inode = file_inode(fp->filp); - ksmbd_acls_fattr(&fattr, user_ns, inode); + ksmbd_acls_fattr(&fattr, idmap, inode); if (test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_ACL_XATTR)) - ppntsd_size = ksmbd_vfs_get_sd_xattr(work->conn, user_ns, + ppntsd_size = ksmbd_vfs_get_sd_xattr(work->conn, idmap, fp->filp->f_path.dentry, &ppntsd); /* Check if sd buffer size exceeds response buffer size */ if (smb2_resp_buf_len(work, 8) > ppntsd_size) - rc = build_sec_desc(user_ns, pntsd, ppntsd, ppntsd_size, + rc = build_sec_desc(idmap, pntsd, ppntsd, ppntsd_size, addition_info, &secdesclen, &fattr); posix_acl_release(fattr.cf_acls); posix_acl_release(fattr.cf_dacls); @@ -5416,7 +5397,7 @@ int smb2_echo(struct ksmbd_work *work) static int smb2_rename(struct ksmbd_work *work, struct ksmbd_file *fp, - struct user_namespace *user_ns, + struct mnt_idmap *idmap, struct smb2_file_rename_info *file_info, struct nls_table *local_nls) { @@ -5479,7 +5460,7 @@ static int smb2_rename(struct ksmbd_work *work, if (rc) goto out; - rc = ksmbd_vfs_setxattr(user_ns, + rc = ksmbd_vfs_setxattr(idmap, fp->filp->f_path.dentry, xattr_stream_name, NULL, 0, 0); @@ -5618,7 +5599,7 @@ static int set_file_basic_info(struct ksmbd_file *fp, struct iattr attrs; struct file *filp; struct inode *inode; - struct user_namespace *user_ns; + struct mnt_idmap *idmap; int rc = 0; if (!(fp->daccess & FILE_WRITE_ATTRIBUTES_LE)) @@ -5627,7 +5608,7 @@ static int set_file_basic_info(struct ksmbd_file *fp, attrs.ia_valid = 0; filp = fp->filp; inode = file_inode(filp); - user_ns = file_mnt_user_ns(filp); + idmap = file_mnt_idmap(filp); if (file_info->CreationTime) fp->create_time = le64_to_cpu(file_info->CreationTime); @@ -5671,7 +5652,7 @@ static int set_file_basic_info(struct ksmbd_file *fp, da.flags = XATTR_DOSINFO_ATTRIB | XATTR_DOSINFO_CREATE_TIME | XATTR_DOSINFO_ITIME; - rc = ksmbd_vfs_set_dos_attrib_xattr(user_ns, + rc = ksmbd_vfs_set_dos_attrib_xattr(idmap, filp->f_path.dentry, &da); if (rc) ksmbd_debug(SMB, @@ -5689,7 +5670,7 @@ static int set_file_basic_info(struct ksmbd_file *fp, inode_lock(inode); inode->i_ctime = attrs.ia_ctime; attrs.ia_valid &= ~ATTR_CTIME; - rc = notify_change(user_ns, dentry, &attrs, NULL); + rc = notify_change(idmap, dentry, &attrs, NULL); 
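
find_file_posix_info() above, like ksmbd_acls_fattr() earlier, now reports ownership as seen through the mount's idmapping rather than the raw inode ids: i_uid_into_vfsuid() applies the idmap, and vfsuid_into_kuid()/from_kuid() translate the result into the id space ksmbd answers from. A condensed kernel-style sketch of that chain, not compilable stand-alone:

/* Owner uid of @inode as a client of this idmapped mount sees it. */
static uid_t owner_seen_by_client(struct mnt_idmap *idmap,
				  struct inode *inode)
{
	/* apply the mount idmapping to the raw inode owner ... */
	vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode);

	/* ... then map into the reporting namespace (init_user_ns,
	 * since ksmbd always runs there).
	 */
	return from_kuid(&init_user_ns, vfsuid_into_kuid(vfsuid));
}
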
inode_unlock(inode); } return rc; @@ -5782,7 +5763,7 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp, struct smb2_file_rename_info *rename_info, unsigned int buf_len) { - struct user_namespace *user_ns; + struct mnt_idmap *idmap; struct ksmbd_file *parent_fp; struct dentry *parent; struct dentry *dentry = fp->filp->f_path.dentry; @@ -5797,12 +5778,12 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp, le32_to_cpu(rename_info->FileNameLength)) return -EINVAL; - user_ns = file_mnt_user_ns(fp->filp); + idmap = file_mnt_idmap(fp->filp); if (ksmbd_stream_fd(fp)) goto next; parent = dget_parent(dentry); - ret = ksmbd_vfs_lock_parent(user_ns, parent, dentry); + ret = ksmbd_vfs_lock_parent(idmap, parent, dentry); if (ret) { dput(parent); return ret; @@ -5821,7 +5802,7 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp, ksmbd_fd_put(work, parent_fp); } next: - return smb2_rename(work, fp, user_ns, rename_info, + return smb2_rename(work, fp, idmap, rename_info, work->conn->local_nls); } @@ -6644,7 +6625,7 @@ int smb2_cancel(struct ksmbd_work *work) struct ksmbd_conn *conn = work->conn; struct smb2_hdr *hdr = smb2_get_msg(work->request_buf); struct smb2_hdr *chdr; - struct ksmbd_work *cancel_work = NULL, *iter; + struct ksmbd_work *iter; struct list_head *command_list; ksmbd_debug(SMB, "smb2 cancel called on mid %llu, async flags 0x%x\n", @@ -6666,7 +6647,9 @@ int smb2_cancel(struct ksmbd_work *work) "smb2 with AsyncId %llu cancelled command = 0x%x\n", le64_to_cpu(hdr->Id.AsyncId), le16_to_cpu(chdr->Command)); - cancel_work = iter; + iter->state = KSMBD_WORK_CANCELLED; + if (iter->cancel_fn) + iter->cancel_fn(iter->cancel_argv); break; } spin_unlock(&conn->request_lock); @@ -6685,18 +6668,12 @@ int smb2_cancel(struct ksmbd_work *work) "smb2 with mid %llu cancelled command = 0x%x\n", le64_to_cpu(hdr->MessageId), le16_to_cpu(chdr->Command)); - cancel_work = iter; + iter->state = KSMBD_WORK_CANCELLED; break; } spin_unlock(&conn->request_lock); } - if (cancel_work) { - cancel_work->state = KSMBD_WORK_CANCELLED; - if (cancel_work->cancel_fn) - cancel_work->cancel_fn(cancel_work->cancel_argv); - } - /* For SMB2_CANCEL command itself send no response*/ work->send_no_response = 1; return 0; @@ -7061,6 +7038,14 @@ skip: ksmbd_vfs_posix_lock_wait(flock); + spin_lock(&work->conn->request_lock); + spin_lock(&fp->f_lock); + list_del(&work->fp_entry); + work->cancel_fn = NULL; + kfree(argv); + spin_unlock(&fp->f_lock); + spin_unlock(&work->conn->request_lock); + if (work->state != KSMBD_WORK_ACTIVE) { list_del(&smb_lock->llist); spin_lock(&work->conn->llist_lock); @@ -7069,9 +7054,6 @@ skip: locks_free_lock(flock); if (work->state == KSMBD_WORK_CANCELLED) { - spin_lock(&fp->f_lock); - list_del(&work->fp_entry); - spin_unlock(&fp->f_lock); rsp->hdr.Status = STATUS_CANCELLED; kfree(smb_lock); @@ -7093,9 +7075,6 @@ skip: list_del(&smb_lock->clist); spin_unlock(&work->conn->llist_lock); - spin_lock(&fp->f_lock); - list_del(&work->fp_entry); - spin_unlock(&fp->f_lock); goto retry; } else if (!rc) { spin_lock(&work->conn->llist_lock); @@ -7530,14 +7509,14 @@ static inline int fsctl_set_sparse(struct ksmbd_work *work, u64 id, struct file_sparse *sparse) { struct ksmbd_file *fp; - struct user_namespace *user_ns; + struct mnt_idmap *idmap; int ret = 0; __le32 old_fattr; fp = ksmbd_lookup_fd_fast(work, id); if (!fp) return -ENOENT; - user_ns = file_mnt_user_ns(fp->filp); + idmap = file_mnt_idmap(fp->filp); old_fattr = fp->f_ci->m_fattr; if 
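
The smb2_cancel()/smb2_lock() changes above close a race on blocked lock requests: the cancel callback now fires while conn->request_lock is still held, and the blocked-lock path unregisters itself (list_del of fp_entry, cancel_fn = NULL) under both request_lock and f_lock before it examines work->state, so a cancel can no longer run against a request that is concurrently tearing itself down. The shape of the two sides, heavily simplified; request_matches() is a stand-in for the mid/async-id comparison in the hunks:

/* Cancel side: state change and callback under the request lock. */
spin_lock(&conn->request_lock);
list_for_each_entry(iter, command_list, request_entry) {
	if (!request_matches(iter, hdr))
		continue;
	iter->state = KSMBD_WORK_CANCELLED;
	if (iter->cancel_fn)
		iter->cancel_fn(iter->cancel_argv);
	break;
}
spin_unlock(&conn->request_lock);

/* Blocked-lock side: leave the cancellable set before acting on state. */
spin_lock(&work->conn->request_lock);
spin_lock(&fp->f_lock);
list_del(&work->fp_entry);
work->cancel_fn = NULL;
spin_unlock(&fp->f_lock);
spin_unlock(&work->conn->request_lock);
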
(sparse->SetSparse) @@ -7550,13 +7529,13 @@ static inline int fsctl_set_sparse(struct ksmbd_work *work, u64 id, KSMBD_SHARE_FLAG_STORE_DOS_ATTRS)) { struct xattr_dos_attrib da; - ret = ksmbd_vfs_get_dos_attrib_xattr(user_ns, + ret = ksmbd_vfs_get_dos_attrib_xattr(idmap, fp->filp->f_path.dentry, &da); if (ret <= 0) goto out; da.attr = le32_to_cpu(fp->f_ci->m_fattr); - ret = ksmbd_vfs_set_dos_attrib_xattr(user_ns, + ret = ksmbd_vfs_set_dos_attrib_xattr(idmap, fp->filp->f_path.dentry, &da); if (ret) fp->f_ci->m_fattr = old_fattr; @@ -8409,14 +8388,11 @@ int smb3_check_sign_req(struct ksmbd_work *work) if (le16_to_cpu(hdr->Command) == SMB2_SESSION_SETUP_HE) { signing_key = work->sess->smb3signingkey; } else { - read_lock(&work->sess->chann_lock); chann = lookup_chann_list(work->sess, conn); if (!chann) { - read_unlock(&work->sess->chann_lock); return 0; } signing_key = chann->smb3signingkey; - read_unlock(&work->sess->chann_lock); } if (!signing_key) { @@ -8476,14 +8452,11 @@ void smb3_set_sign_rsp(struct ksmbd_work *work) le16_to_cpu(hdr->Command) == SMB2_SESSION_SETUP_HE) { signing_key = work->sess->smb3signingkey; } else { - read_lock(&work->sess->chann_lock); chann = lookup_chann_list(work->sess, work->conn); if (!chann) { - read_unlock(&work->sess->chann_lock); return; } signing_key = chann->smb3signingkey; - read_unlock(&work->sess->chann_lock); } if (!signing_key) @@ -8663,6 +8636,7 @@ int smb3_decrypt_req(struct ksmbd_work *work) bool smb3_11_final_sess_setup_resp(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; + struct ksmbd_session *sess = work->sess; struct smb2_hdr *rsp = smb2_get_msg(work->response_buf); if (conn->dialect < SMB30_PROT_ID) @@ -8672,6 +8646,7 @@ bool smb3_11_final_sess_setup_resp(struct ksmbd_work *work) rsp = ksmbd_resp_buf_next(work); if (le16_to_cpu(rsp->Command) == SMB2_SESSION_SETUP_HE && + sess->user && !user_guest(sess->user) && rsp->Status == STATUS_SUCCESS) return true; return false; diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h index aa5dbe54f5a1..0c8a770fe318 100644 --- a/fs/ksmbd/smb2pdu.h +++ b/fs/ksmbd/smb2pdu.h @@ -24,8 +24,9 @@ #define SMB21_DEFAULT_IOSIZE (1024 * 1024) #define SMB3_DEFAULT_TRANS_SIZE (1024 * 1024) -#define SMB3_MIN_IOSIZE (64 * 1024) -#define SMB3_MAX_IOSIZE (8 * 1024 * 1024) +#define SMB3_MIN_IOSIZE (64 * 1024) +#define SMB3_MAX_IOSIZE (8 * 1024 * 1024) +#define SMB3_MAX_MSGSIZE (4 * 4096) /* * Definitions for SMB2 Protocol Data Units (network frames) diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c index 2a4fbbd55b91..fa2b54df6ee6 100644 --- a/fs/ksmbd/smb_common.c +++ b/fs/ksmbd/smb_common.c @@ -307,7 +307,7 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level, { int i, rc = 0; struct ksmbd_conn *conn = work->conn; - struct user_namespace *user_ns = file_mnt_user_ns(dir->filp); + struct mnt_idmap *idmap = file_mnt_idmap(dir->filp); for (i = 0; i < 2; i++) { struct kstat kstat; @@ -333,7 +333,7 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level, ksmbd_kstat.kstat = &kstat; ksmbd_vfs_fill_dentry_attrs(work, - user_ns, + idmap, dentry, &ksmbd_kstat); rc = fn(conn, info_level, d_info, &ksmbd_kstat); diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c index ab5c68cc0e13..6d6cfb6957a9 100644 --- a/fs/ksmbd/smbacl.c +++ b/fs/ksmbd/smbacl.c @@ -254,7 +254,7 @@ void id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid) ssid->num_subauth++; } -static int sid_to_id(struct user_namespace *user_ns, +static int sid_to_id(struct mnt_idmap 
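
transport_tcp.c above gains a global connection cap: each accepted socket bumps active_num_conn, and when the configured limit is exceeded the socket is released and the counter rolled back (ksmbd_tcp_disconnect() decrements it on the normal path). The inc-then-check-then-back-off idiom in portable C11, with invented names; note the sketch admits exactly max_connections, whereas the hunk's ">=" test counts the candidate connection against the limit before admitting it:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int active_num_conn;
static int max_connections = 2;		/* assumed config knob */

/* Returns true if the new connection may proceed. */
static bool try_admit_connection(void)
{
	if (max_connections &&
	    atomic_fetch_add(&active_num_conn, 1) + 1 > max_connections) {
		atomic_fetch_sub(&active_num_conn, 1);	/* back off */
		return false;
	}
	return true;
}

static void release_connection(void)
{
	if (max_connections)
		atomic_fetch_sub(&active_num_conn, 1);
}

int main(void)
{
	for (int i = 0; i < 4; i++)
		printf("conn %d: %s\n", i,
		       try_admit_connection() ? "accepted" : "rejected");
	release_connection();
	printf("after one close: %s\n",
	       try_admit_connection() ? "accepted" : "rejected");
	return 0;
}
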
*idmap, struct smb_sid *psid, uint sidtype, struct smb_fattr *fattr) { @@ -276,7 +276,7 @@ static int sid_to_id(struct user_namespace *user_ns, id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]); uid = KUIDT_INIT(id); - uid = from_vfsuid(user_ns, &init_user_ns, VFSUIDT_INIT(uid)); + uid = from_vfsuid(idmap, &init_user_ns, VFSUIDT_INIT(uid)); if (uid_valid(uid)) { fattr->cf_uid = uid; rc = 0; @@ -287,7 +287,7 @@ static int sid_to_id(struct user_namespace *user_ns, id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]); gid = KGIDT_INIT(id); - gid = from_vfsgid(user_ns, &init_user_ns, VFSGIDT_INIT(gid)); + gid = from_vfsgid(idmap, &init_user_ns, VFSGIDT_INIT(gid)); if (gid_valid(gid)) { fattr->cf_gid = gid; rc = 0; @@ -362,7 +362,7 @@ void free_acl_state(struct posix_acl_state *state) kfree(state->groups); } -static void parse_dacl(struct user_namespace *user_ns, +static void parse_dacl(struct mnt_idmap *idmap, struct smb_acl *pdacl, char *end_of_acl, struct smb_sid *pownersid, struct smb_sid *pgrpsid, struct smb_fattr *fattr) @@ -489,7 +489,7 @@ static void parse_dacl(struct user_namespace *user_ns, acl_mode = access_flags_to_mode(fattr, ppace[i]->access_req, ppace[i]->type); temp_fattr.cf_uid = INVALID_UID; - ret = sid_to_id(user_ns, &ppace[i]->sid, SIDOWNER, &temp_fattr); + ret = sid_to_id(idmap, &ppace[i]->sid, SIDOWNER, &temp_fattr); if (ret || uid_eq(temp_fattr.cf_uid, INVALID_UID)) { pr_err("%s: Error %d mapping Owner SID to uid\n", __func__, ret); @@ -575,7 +575,7 @@ static void parse_dacl(struct user_namespace *user_ns, free_acl_state(&default_acl_state); } -static void set_posix_acl_entries_dacl(struct user_namespace *user_ns, +static void set_posix_acl_entries_dacl(struct mnt_idmap *idmap, struct smb_ace *pndace, struct smb_fattr *fattr, u32 *num_aces, u16 *size, u32 nt_aces_num) @@ -600,14 +600,14 @@ static void set_posix_acl_entries_dacl(struct user_namespace *user_ns, uid_t uid; unsigned int sid_type = SIDOWNER; - uid = posix_acl_uid_translate(user_ns, pace); + uid = posix_acl_uid_translate(idmap, pace); if (!uid) sid_type = SIDUNIX_USER; id_to_sid(uid, sid_type, sid); } else if (pace->e_tag == ACL_GROUP) { gid_t gid; - gid = posix_acl_gid_translate(user_ns, pace); + gid = posix_acl_gid_translate(idmap, pace); id_to_sid(gid, SIDUNIX_GROUP, sid); } else if (pace->e_tag == ACL_OTHER && !nt_aces_num) { smb_copy_sid(sid, &sid_everyone); @@ -666,12 +666,12 @@ posix_default_acl: if (pace->e_tag == ACL_USER) { uid_t uid; - uid = posix_acl_uid_translate(user_ns, pace); + uid = posix_acl_uid_translate(idmap, pace); id_to_sid(uid, SIDCREATOR_OWNER, sid); } else if (pace->e_tag == ACL_GROUP) { gid_t gid; - gid = posix_acl_gid_translate(user_ns, pace); + gid = posix_acl_gid_translate(idmap, pace); id_to_sid(gid, SIDCREATOR_GROUP, sid); } else { kfree(sid); @@ -689,7 +689,7 @@ posix_default_acl: } } -static void set_ntacl_dacl(struct user_namespace *user_ns, +static void set_ntacl_dacl(struct mnt_idmap *idmap, struct smb_acl *pndacl, struct smb_acl *nt_dacl, unsigned int aces_size, @@ -723,13 +723,13 @@ static void set_ntacl_dacl(struct user_namespace *user_ns, } } - set_posix_acl_entries_dacl(user_ns, pndace, fattr, + set_posix_acl_entries_dacl(idmap, pndace, fattr, &num_aces, &size, nt_num_aces); pndacl->num_aces = cpu_to_le32(num_aces); pndacl->size = cpu_to_le16(le16_to_cpu(pndacl->size) + size); } -static void set_mode_dacl(struct user_namespace *user_ns, +static void set_mode_dacl(struct mnt_idmap *idmap, struct smb_acl *pndacl, struct smb_fattr *fattr) { struct smb_ace *pace, 
*pndace; @@ -741,7 +741,7 @@ static void set_mode_dacl(struct user_namespace *user_ns, pace = pndace = (struct smb_ace *)((char *)pndacl + sizeof(struct smb_acl)); if (fattr->cf_acls) { - set_posix_acl_entries_dacl(user_ns, pndace, fattr, + set_posix_acl_entries_dacl(idmap, pndace, fattr, &num_aces, &size, num_aces); goto out; } @@ -808,7 +808,7 @@ static int parse_sid(struct smb_sid *psid, char *end_of_acl) } /* Convert CIFS ACL to POSIX form */ -int parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd, +int parse_sec_desc(struct mnt_idmap *idmap, struct smb_ntsd *pntsd, int acl_len, struct smb_fattr *fattr) { int rc = 0; @@ -851,7 +851,7 @@ int parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd, return rc; } - rc = sid_to_id(user_ns, owner_sid_ptr, SIDOWNER, fattr); + rc = sid_to_id(idmap, owner_sid_ptr, SIDOWNER, fattr); if (rc) { pr_err("%s: Error %d mapping Owner SID to uid\n", __func__, rc); @@ -866,7 +866,7 @@ int parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd, __func__, rc); return rc; } - rc = sid_to_id(user_ns, group_sid_ptr, SIDUNIX_GROUP, fattr); + rc = sid_to_id(idmap, group_sid_ptr, SIDUNIX_GROUP, fattr); if (rc) { pr_err("%s: Error %d mapping Group SID to gid\n", __func__, rc); @@ -881,7 +881,7 @@ int parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd, pntsd->type |= cpu_to_le16(DACL_PROTECTED); if (dacloffset) { - parse_dacl(user_ns, dacl_ptr, end_of_acl, + parse_dacl(idmap, dacl_ptr, end_of_acl, owner_sid_ptr, group_sid_ptr, fattr); } @@ -889,7 +889,7 @@ int parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd, } /* Convert permission bits from mode to equivalent CIFS ACL */ -int build_sec_desc(struct user_namespace *user_ns, +int build_sec_desc(struct mnt_idmap *idmap, struct smb_ntsd *pntsd, struct smb_ntsd *ppntsd, int ppntsd_size, int addition_info, __u32 *secdesclen, struct smb_fattr *fattr) @@ -950,7 +950,7 @@ int build_sec_desc(struct user_namespace *user_ns, dacl_ptr->num_aces = 0; if (!ppntsd) { - set_mode_dacl(user_ns, dacl_ptr, fattr); + set_mode_dacl(idmap, dacl_ptr, fattr); } else { struct smb_acl *ppdacl_ptr; unsigned int dacl_offset = le32_to_cpu(ppntsd->dacloffset); @@ -966,7 +966,7 @@ int build_sec_desc(struct user_namespace *user_ns, ppdacl_size < sizeof(struct smb_acl)) goto out; - set_ntacl_dacl(user_ns, dacl_ptr, ppdacl_ptr, + set_ntacl_dacl(idmap, dacl_ptr, ppdacl_ptr, ntacl_size - sizeof(struct smb_acl), nowner_sid_ptr, ngroup_sid_ptr, fattr); @@ -1002,13 +1002,13 @@ int smb_inherit_dacl(struct ksmbd_conn *conn, struct smb_ntsd *parent_pntsd = NULL; struct smb_sid owner_sid, group_sid; struct dentry *parent = path->dentry->d_parent; - struct user_namespace *user_ns = mnt_user_ns(path->mnt); + struct mnt_idmap *idmap = mnt_idmap(path->mnt); int inherited_flags = 0, flags = 0, i, ace_cnt = 0, nt_size = 0, pdacl_size; int rc = 0, num_aces, dacloffset, pntsd_type, pntsd_size, acl_len, aces_size; char *aces_base; bool is_dir = S_ISDIR(d_inode(path->dentry)->i_mode); - pntsd_size = ksmbd_vfs_get_sd_xattr(conn, user_ns, + pntsd_size = ksmbd_vfs_get_sd_xattr(conn, idmap, parent, &parent_pntsd); if (pntsd_size <= 0) return -ENOENT; @@ -1162,7 +1162,7 @@ pass: pntsd_size += sizeof(struct smb_acl) + nt_size; } - ksmbd_vfs_set_sd_xattr(conn, user_ns, + ksmbd_vfs_set_sd_xattr(conn, idmap, path->dentry, pntsd, pntsd_size); kfree(pntsd); } @@ -1190,7 +1190,7 @@ bool smb_inherit_flags(int flags, bool is_dir) int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct 
path *path, __le32 *pdaccess, int uid) { - struct user_namespace *user_ns = mnt_user_ns(path->mnt); + struct mnt_idmap *idmap = mnt_idmap(path->mnt); struct smb_ntsd *pntsd = NULL; struct smb_acl *pdacl; struct posix_acl *posix_acls; @@ -1206,7 +1206,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, unsigned short ace_size; ksmbd_debug(SMB, "check permission using windows acl\n"); - pntsd_size = ksmbd_vfs_get_sd_xattr(conn, user_ns, + pntsd_size = ksmbd_vfs_get_sd_xattr(conn, idmap, path->dentry, &pntsd); if (pntsd_size <= 0 || !pntsd) goto err_out; @@ -1296,9 +1296,9 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, pa_entry = posix_acls->a_entries; for (i = 0; i < posix_acls->a_count; i++, pa_entry++) { if (pa_entry->e_tag == ACL_USER) - id = posix_acl_uid_translate(user_ns, pa_entry); + id = posix_acl_uid_translate(idmap, pa_entry); else if (pa_entry->e_tag == ACL_GROUP) - id = posix_acl_gid_translate(user_ns, pa_entry); + id = posix_acl_gid_translate(idmap, pa_entry); else continue; @@ -1360,14 +1360,14 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, int rc; struct smb_fattr fattr = {{0}}; struct inode *inode = d_inode(path->dentry); - struct user_namespace *user_ns = mnt_user_ns(path->mnt); + struct mnt_idmap *idmap = mnt_idmap(path->mnt); struct iattr newattrs; fattr.cf_uid = INVALID_UID; fattr.cf_gid = INVALID_GID; fattr.cf_mode = inode->i_mode; - rc = parse_sec_desc(user_ns, pntsd, ntsd_len, &fattr); + rc = parse_sec_desc(idmap, pntsd, ntsd_len, &fattr); if (rc) goto out; @@ -1383,17 +1383,17 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, newattrs.ia_valid |= ATTR_MODE; newattrs.ia_mode = (inode->i_mode & ~0777) | (fattr.cf_mode & 0777); - ksmbd_vfs_remove_acl_xattrs(user_ns, path->dentry); + ksmbd_vfs_remove_acl_xattrs(idmap, path->dentry); /* Update posix acls */ if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && fattr.cf_dacls) { - rc = set_posix_acl(user_ns, path->dentry, + rc = set_posix_acl(idmap, path->dentry, ACL_TYPE_ACCESS, fattr.cf_acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", rc); if (S_ISDIR(inode->i_mode) && fattr.cf_dacls) { - rc = set_posix_acl(user_ns, path->dentry, + rc = set_posix_acl(idmap, path->dentry, ACL_TYPE_DEFAULT, fattr.cf_dacls); if (rc) ksmbd_debug(SMB, @@ -1403,7 +1403,7 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, } inode_lock(inode); - rc = notify_change(user_ns, path->dentry, &newattrs, NULL); + rc = notify_change(idmap, path->dentry, &newattrs, NULL); inode_unlock(inode); if (rc) goto out; @@ -1414,8 +1414,8 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, if (test_share_config_flag(tcon->share_conf, KSMBD_SHARE_FLAG_ACL_XATTR)) { /* Update WinACL in xattr */ - ksmbd_vfs_remove_sd_xattrs(user_ns, path->dentry); - ksmbd_vfs_set_sd_xattr(conn, user_ns, + ksmbd_vfs_remove_sd_xattrs(idmap, path->dentry); + ksmbd_vfs_set_sd_xattr(conn, idmap, path->dentry, pntsd, ntsd_len); } diff --git a/fs/ksmbd/smbacl.h b/fs/ksmbd/smbacl.h index 618f2e0236b3..49a8c292bd2e 100644 --- a/fs/ksmbd/smbacl.h +++ b/fs/ksmbd/smbacl.h @@ -190,9 +190,9 @@ struct posix_acl_state { struct posix_ace_state_array *groups; }; -int parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd, +int parse_sec_desc(struct mnt_idmap *idmap, struct smb_ntsd *pntsd, int acl_len, struct smb_fattr *fattr); -int build_sec_desc(struct user_namespace *user_ns, struct smb_ntsd 
*pntsd, +int build_sec_desc(struct mnt_idmap *idmap, struct smb_ntsd *pntsd, struct smb_ntsd *ppntsd, int ppntsd_size, int addition_info, __u32 *secdesclen, struct smb_fattr *fattr); int init_acl_state(struct posix_acl_state *state, int cnt); @@ -211,25 +211,25 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, void id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid); void ksmbd_init_domain(u32 *sub_auth); -static inline uid_t posix_acl_uid_translate(struct user_namespace *mnt_userns, +static inline uid_t posix_acl_uid_translate(struct mnt_idmap *idmap, struct posix_acl_entry *pace) { vfsuid_t vfsuid; /* If this is an idmapped mount, apply the idmapping. */ - vfsuid = make_vfsuid(mnt_userns, &init_user_ns, pace->e_uid); + vfsuid = make_vfsuid(idmap, &init_user_ns, pace->e_uid); /* Translate the kuid into a userspace id ksmbd would see. */ return from_kuid(&init_user_ns, vfsuid_into_kuid(vfsuid)); } -static inline gid_t posix_acl_gid_translate(struct user_namespace *mnt_userns, +static inline gid_t posix_acl_gid_translate(struct mnt_idmap *idmap, struct posix_acl_entry *pace) { vfsgid_t vfsgid; /* If this is an idmapped mount, apply the idmapping. */ - vfsgid = make_vfsgid(mnt_userns, &init_user_ns, pace->e_gid); + vfsgid = make_vfsgid(idmap, &init_user_ns, pace->e_gid); /* Translate the kgid into a userspace id ksmbd would see. */ return from_kgid(&init_user_ns, vfsgid_into_kgid(vfsgid)); diff --git a/fs/ksmbd/transport_ipc.c b/fs/ksmbd/transport_ipc.c index c9aca21637d5..40c721f9227e 100644 --- a/fs/ksmbd/transport_ipc.c +++ b/fs/ksmbd/transport_ipc.c @@ -308,6 +308,9 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req) if (req->smbd_max_io_size) init_smbd_max_io_size(req->smbd_max_io_size); + if (req->max_connections) + server_conf.max_connections = req->max_connections; + ret = ksmbd_set_netbios_name(req->netbios_name); ret |= ksmbd_set_server_string(req->server_string); ret |= ksmbd_set_work_group(req->work_group); diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c index 4c6bd0b69979..603893fd87f5 100644 --- a/fs/ksmbd/transport_tcp.c +++ b/fs/ksmbd/transport_tcp.c @@ -15,6 +15,8 @@ #define IFACE_STATE_DOWN BIT(0) #define IFACE_STATE_CONFIGURED BIT(1) +static atomic_t active_num_conn; + struct interface { struct task_struct *ksmbd_kthread; struct socket *ksmbd_socket; @@ -185,8 +187,10 @@ static int ksmbd_tcp_new_connection(struct socket *client_sk) struct tcp_transport *t; t = alloc_transport(client_sk); - if (!t) + if (!t) { + sock_release(client_sk); return -ENOMEM; + } csin = KSMBD_TCP_PEER_SOCKADDR(KSMBD_TRANS(t)->conn); if (kernel_getpeername(client_sk, csin) < 0) { @@ -239,6 +243,15 @@ static int ksmbd_kthread_fn(void *p) continue; } + if (server_conf.max_connections && + atomic_inc_return(&active_num_conn) >= server_conf.max_connections) { + pr_info_ratelimited("Limit the maximum number of connections(%u)\n", + atomic_read(&active_num_conn)); + atomic_dec(&active_num_conn); + sock_release(client_sk); + continue; + } + ksmbd_debug(CONN, "connect success: accepted new connection\n"); client_sk->sk->sk_rcvtimeo = KSMBD_TCP_RECV_TIMEOUT; client_sk->sk->sk_sndtimeo = KSMBD_TCP_SEND_TIMEOUT; @@ -368,6 +381,8 @@ static int ksmbd_tcp_writev(struct ksmbd_transport *t, struct kvec *iov, static void ksmbd_tcp_disconnect(struct ksmbd_transport *t) { free_transport(TCP_TRANS(t)); + if (server_conf.max_connections) + atomic_dec(&active_num_conn); } static void tcp_destroy_socket(struct socket *ksmbd_socket) diff 
--git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index ff0e7a4fcd4d..5ea9229dad2c 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -6,6 +6,7 @@ #include <linux/kernel.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/uaccess.h> #include <linux/backing-dev.h> #include <linux/writeback.h> @@ -69,14 +70,14 @@ static void ksmbd_vfs_inherit_owner(struct ksmbd_work *work, * * the reference count of @parent isn't incremented. */ -int ksmbd_vfs_lock_parent(struct user_namespace *user_ns, struct dentry *parent, +int ksmbd_vfs_lock_parent(struct mnt_idmap *idmap, struct dentry *parent, struct dentry *child) { struct dentry *dentry; int ret = 0; inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); - dentry = lookup_one(user_ns, child->d_name.name, parent, + dentry = lookup_one(idmap, child->d_name.name, parent, child->d_name.len); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); @@ -96,20 +97,20 @@ out_err: return ret; } -int ksmbd_vfs_may_delete(struct user_namespace *user_ns, +int ksmbd_vfs_may_delete(struct mnt_idmap *idmap, struct dentry *dentry) { struct dentry *parent; int ret; parent = dget_parent(dentry); - ret = ksmbd_vfs_lock_parent(user_ns, parent, dentry); + ret = ksmbd_vfs_lock_parent(idmap, parent, dentry); if (ret) { dput(parent); return ret; } - ret = inode_permission(user_ns, d_inode(parent), + ret = inode_permission(idmap, d_inode(parent), MAY_EXEC | MAY_WRITE); inode_unlock(d_inode(parent)); @@ -117,7 +118,7 @@ int ksmbd_vfs_may_delete(struct user_namespace *user_ns, return ret; } -int ksmbd_vfs_query_maximal_access(struct user_namespace *user_ns, +int ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap, struct dentry *dentry, __le32 *daccess) { struct dentry *parent; @@ -125,26 +126,26 @@ int ksmbd_vfs_query_maximal_access(struct user_namespace *user_ns, *daccess = cpu_to_le32(FILE_READ_ATTRIBUTES | READ_CONTROL); - if (!inode_permission(user_ns, d_inode(dentry), MAY_OPEN | MAY_WRITE)) + if (!inode_permission(idmap, d_inode(dentry), MAY_OPEN | MAY_WRITE)) *daccess |= cpu_to_le32(WRITE_DAC | WRITE_OWNER | SYNCHRONIZE | FILE_WRITE_DATA | FILE_APPEND_DATA | FILE_WRITE_EA | FILE_WRITE_ATTRIBUTES | FILE_DELETE_CHILD); - if (!inode_permission(user_ns, d_inode(dentry), MAY_OPEN | MAY_READ)) + if (!inode_permission(idmap, d_inode(dentry), MAY_OPEN | MAY_READ)) *daccess |= FILE_READ_DATA_LE | FILE_READ_EA_LE; - if (!inode_permission(user_ns, d_inode(dentry), MAY_OPEN | MAY_EXEC)) + if (!inode_permission(idmap, d_inode(dentry), MAY_OPEN | MAY_EXEC)) *daccess |= FILE_EXECUTE_LE; parent = dget_parent(dentry); - ret = ksmbd_vfs_lock_parent(user_ns, parent, dentry); + ret = ksmbd_vfs_lock_parent(idmap, parent, dentry); if (ret) { dput(parent); return ret; } - if (!inode_permission(user_ns, d_inode(parent), MAY_EXEC | MAY_WRITE)) + if (!inode_permission(idmap, d_inode(parent), MAY_EXEC | MAY_WRITE)) *daccess |= FILE_DELETE_LE; inode_unlock(d_inode(parent)); @@ -177,7 +178,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) } mode |= S_IFREG; - err = vfs_create(mnt_user_ns(path.mnt), d_inode(path.dentry), + err = vfs_create(mnt_idmap(path.mnt), d_inode(path.dentry), dentry, mode, true); if (!err) { ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), @@ -199,7 +200,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) */ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) { - struct user_namespace *user_ns; + struct mnt_idmap *idmap; struct path path; struct dentry *dentry; int err; @@ -215,15 +216,15 
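
ksmbd_vfs_lock_parent(), converted above, implements a common file-server pattern: lock the parent directory, then re-look-up the child by name and check that the returned dentry is still the one in hand, since a concurrent rename could have moved it while nothing was locked. A condensed fragment of the idea; the error code is illustrative, not necessarily the helper's real one:

inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
dentry = lookup_one(idmap, child->d_name.name, parent, child->d_name.len);
if (IS_ERR(dentry)) {
	ret = PTR_ERR(dentry);
	goto out_unlock;
}
if (dentry != child) {
	/* child was renamed away while we were unlocked */
	dput(dentry);
	ret = -ESTALE;
	goto out_unlock;
}
dput(dentry);
/* success: the parent inode stays locked for the caller */
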
@@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) return err; } - user_ns = mnt_user_ns(path.mnt); + idmap = mnt_idmap(path.mnt); mode |= S_IFDIR; - err = vfs_mkdir(user_ns, d_inode(path.dentry), dentry, mode); + err = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode); if (err) { goto out; } else if (d_unhashed(dentry)) { struct dentry *d; - d = lookup_one(user_ns, dentry->d_name.name, dentry->d_parent, + d = lookup_one(idmap, dentry->d_name.name, dentry->d_parent, dentry->d_name.len); if (IS_ERR(d)) { err = PTR_ERR(d); @@ -245,7 +246,7 @@ out: return err; } -static ssize_t ksmbd_vfs_getcasexattr(struct user_namespace *user_ns, +static ssize_t ksmbd_vfs_getcasexattr(struct mnt_idmap *idmap, struct dentry *dentry, char *attr_name, int attr_name_len, char **attr_value) { @@ -262,7 +263,7 @@ static ssize_t ksmbd_vfs_getcasexattr(struct user_namespace *user_ns, if (strncasecmp(attr_name, name, attr_name_len)) continue; - value_len = ksmbd_vfs_getxattr(user_ns, + value_len = ksmbd_vfs_getxattr(idmap, dentry, name, attr_value); @@ -285,7 +286,7 @@ static int ksmbd_vfs_stream_read(struct ksmbd_file *fp, char *buf, loff_t *pos, ksmbd_debug(VFS, "read stream data pos : %llu, count : %zd\n", *pos, count); - v_len = ksmbd_vfs_getcasexattr(file_mnt_user_ns(fp->filp), + v_len = ksmbd_vfs_getcasexattr(file_mnt_idmap(fp->filp), fp->filp->f_path.dentry, fp->stream.name, fp->stream.size, @@ -409,7 +410,7 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, size_t count) { char *stream_buf = NULL, *wbuf; - struct user_namespace *user_ns = file_mnt_user_ns(fp->filp); + struct mnt_idmap *idmap = file_mnt_idmap(fp->filp); size_t size, v_len; int err = 0; @@ -422,7 +423,7 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, count = (*pos + count) - XATTR_SIZE_MAX; } - v_len = ksmbd_vfs_getcasexattr(user_ns, + v_len = ksmbd_vfs_getcasexattr(idmap, fp->filp->f_path.dentry, fp->stream.name, fp->stream.size, @@ -448,7 +449,7 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, memcpy(&stream_buf[*pos], buf, count); - err = ksmbd_vfs_setxattr(user_ns, + err = ksmbd_vfs_setxattr(idmap, fp->filp->f_path.dentry, fp->stream.name, (void *)stream_buf, @@ -583,7 +584,7 @@ int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id) */ int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name) { - struct user_namespace *user_ns; + struct mnt_idmap *idmap; struct path path; struct dentry *parent; int err; @@ -598,9 +599,9 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name) return err; } - user_ns = mnt_user_ns(path.mnt); + idmap = mnt_idmap(path.mnt); parent = dget_parent(path.dentry); - err = ksmbd_vfs_lock_parent(user_ns, parent, path.dentry); + err = ksmbd_vfs_lock_parent(idmap, parent, path.dentry); if (err) { dput(parent); path_put(&path); @@ -614,12 +615,12 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name) } if (S_ISDIR(d_inode(path.dentry)->i_mode)) { - err = vfs_rmdir(user_ns, d_inode(parent), path.dentry); + err = vfs_rmdir(idmap, d_inode(parent), path.dentry); if (err && err != -ENOTEMPTY) ksmbd_debug(VFS, "%s: rmdir failed, err %d\n", name, err); } else { - err = vfs_unlink(user_ns, d_inode(parent), path.dentry, NULL); + err = vfs_unlink(idmap, d_inode(parent), path.dentry, NULL); if (err) ksmbd_debug(VFS, "%s: unlink failed, err %d\n", name, err); @@ -672,7 +673,7 @@ int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname, goto out3; } - err = 
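
ksmbd_vfs_getcasexattr() above does case-insensitive xattr lookup the only way the VFS allows: list every name, compare case-insensitively, then fetch by the exact stored name (SMB stream names are case-insensitive on the wire). A userspace analog using the syscall wrappers from <sys/xattr.h>; the kernel version compares a length-bounded prefix, this one matches whole names:

#include <sys/xattr.h>
#include <strings.h>
#include <string.h>
#include <stdio.h>

/* Fetch an xattr by case-insensitive name; value length or -1. */
static ssize_t getcasexattr(const char *path, const char *want,
			    char *buf, size_t bufsz)
{
	char list[4096];
	ssize_t len = listxattr(path, list, sizeof(list));

	for (ssize_t off = 0; off < len; off += strlen(list + off) + 1) {
		const char *name = list + off;

		if (strcasecmp(name, want))
			continue;
		return getxattr(path, name, buf, bufsz);	/* exact name */
	}
	return -1;
}

int main(int argc, char **argv)
{
	char buf[256];
	ssize_t n;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <file> <xattr-name>\n", argv[0]);
		return 1;
	}
	n = getcasexattr(argv[1], argv[2], buf, sizeof(buf));
	if (n < 0) {
		perror(argv[2]);
		return 1;
	}
	printf("%.*s\n", (int)n, buf);
	return 0;
}
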
vfs_link(oldpath.dentry, mnt_user_ns(newpath.mnt), + err = vfs_link(oldpath.dentry, mnt_idmap(newpath.mnt), d_inode(newpath.dentry), dentry, NULL); if (err) @@ -711,10 +712,10 @@ static int ksmbd_validate_entry_in_use(struct dentry *src_dent) } static int __ksmbd_vfs_rename(struct ksmbd_work *work, - struct user_namespace *src_user_ns, + struct mnt_idmap *src_idmap, struct dentry *src_dent_parent, struct dentry *src_dent, - struct user_namespace *dst_user_ns, + struct mnt_idmap *dst_idmap, struct dentry *dst_dent_parent, struct dentry *trap_dent, char *dst_name) @@ -740,8 +741,8 @@ static int __ksmbd_vfs_rename(struct ksmbd_work *work, if (ksmbd_override_fsids(work)) return -ENOMEM; - dst_dent = lookup_one(dst_user_ns, dst_name, dst_dent_parent, - strlen(dst_name)); + dst_dent = lookup_one(dst_idmap, dst_name, + dst_dent_parent, strlen(dst_name)); err = PTR_ERR(dst_dent); if (IS_ERR(dst_dent)) { pr_err("lookup failed %s [%d]\n", dst_name, err); @@ -751,10 +752,10 @@ static int __ksmbd_vfs_rename(struct ksmbd_work *work, err = -ENOTEMPTY; if (dst_dent != trap_dent && !d_really_is_positive(dst_dent)) { struct renamedata rd = { - .old_mnt_userns = src_user_ns, + .old_mnt_idmap = src_idmap, .old_dir = d_inode(src_dent_parent), .old_dentry = src_dent, - .new_mnt_userns = dst_user_ns, + .new_mnt_idmap = dst_idmap, .new_dir = d_inode(dst_dent_parent), .new_dentry = dst_dent, }; @@ -772,7 +773,7 @@ out: int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp, char *newname) { - struct user_namespace *user_ns; + struct mnt_idmap *idmap; struct path dst_path; struct dentry *src_dent_parent, *dst_dent_parent; struct dentry *src_dent, *trap_dent, *src_child; @@ -800,8 +801,8 @@ int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp, trap_dent = lock_rename(src_dent_parent, dst_dent_parent); dget(src_dent); dget(dst_dent_parent); - user_ns = file_mnt_user_ns(fp->filp); - src_child = lookup_one(user_ns, src_dent->d_name.name, src_dent_parent, + idmap = file_mnt_idmap(fp->filp); + src_child = lookup_one(idmap, src_dent->d_name.name, src_dent_parent, src_dent->d_name.len); if (IS_ERR(src_child)) { err = PTR_ERR(src_child); @@ -816,10 +817,10 @@ int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp, dput(src_child); err = __ksmbd_vfs_rename(work, - user_ns, + idmap, src_dent_parent, src_dent, - mnt_user_ns(dst_path.mnt), + mnt_idmap(dst_path.mnt), dst_dent_parent, trap_dent, dst_name); @@ -907,22 +908,22 @@ ssize_t ksmbd_vfs_listxattr(struct dentry *dentry, char **list) return size; } -static ssize_t ksmbd_vfs_xattr_len(struct user_namespace *user_ns, +static ssize_t ksmbd_vfs_xattr_len(struct mnt_idmap *idmap, struct dentry *dentry, char *xattr_name) { - return vfs_getxattr(user_ns, dentry, xattr_name, NULL, 0); + return vfs_getxattr(idmap, dentry, xattr_name, NULL, 0); } /** * ksmbd_vfs_getxattr() - vfs helper for smb get extended attributes value - * @user_ns: user namespace + * @idmap: idmap * @dentry: dentry of file for getting xattrs * @xattr_name: name of xattr name to query * @xattr_buf: destination buffer xattr value * * Return: read xattr value length on success, otherwise error */ -ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns, +ssize_t ksmbd_vfs_getxattr(struct mnt_idmap *idmap, struct dentry *dentry, char *xattr_name, char **xattr_buf) { @@ -930,7 +931,7 @@ ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns, char *buf; *xattr_buf = NULL; - xattr_len = ksmbd_vfs_xattr_len(user_ns, dentry, xattr_name); + xattr_len = 
ksmbd_vfs_xattr_len(idmap, dentry, xattr_name); if (xattr_len < 0) return xattr_len; @@ -938,7 +939,7 @@ ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns, if (!buf) return -ENOMEM; - xattr_len = vfs_getxattr(user_ns, dentry, xattr_name, + xattr_len = vfs_getxattr(idmap, dentry, xattr_name, (void *)buf, xattr_len); if (xattr_len > 0) *xattr_buf = buf; @@ -949,22 +950,22 @@ ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns, /** * ksmbd_vfs_setxattr() - vfs helper for smb set extended attributes value - * @user_ns: user namespace + * @idmap: idmap of the relevant mount * @dentry: dentry to set XATTR at - * @name: xattr name for setxattr - * @value: xattr value to set - * @size: size of xattr value + * @attr_name: xattr name for setxattr + * @attr_value: xattr value to set + * @attr_size: size of xattr value * @flags: destination buffer length * * Return: 0 on success, otherwise error */ -int ksmbd_vfs_setxattr(struct user_namespace *user_ns, +int ksmbd_vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *attr_name, void *attr_value, size_t attr_size, int flags) { int err; - err = vfs_setxattr(user_ns, + err = vfs_setxattr(idmap, dentry, attr_name, attr_value, @@ -1074,26 +1075,26 @@ int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, return ret; } -int ksmbd_vfs_remove_xattr(struct user_namespace *user_ns, +int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, struct dentry *dentry, char *attr_name) { - return vfs_removexattr(user_ns, dentry, attr_name); + return vfs_removexattr(idmap, dentry, attr_name); } -int ksmbd_vfs_unlink(struct user_namespace *user_ns, +int ksmbd_vfs_unlink(struct mnt_idmap *idmap, struct dentry *dir, struct dentry *dentry) { int err = 0; - err = ksmbd_vfs_lock_parent(user_ns, dir, dentry); + err = ksmbd_vfs_lock_parent(idmap, dir, dentry); if (err) return err; dget(dentry); if (S_ISDIR(d_inode(dentry)->i_mode)) - err = vfs_rmdir(user_ns, d_inode(dir), dentry); + err = vfs_rmdir(idmap, d_inode(dir), dentry); else - err = vfs_unlink(user_ns, d_inode(dir), dentry, NULL); + err = vfs_unlink(idmap, d_inode(dir), dentry, NULL); dput(dentry); inode_unlock(d_inode(dir)); @@ -1298,7 +1299,7 @@ struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, return dent; } -int ksmbd_vfs_remove_acl_xattrs(struct user_namespace *user_ns, +int ksmbd_vfs_remove_acl_xattrs(struct mnt_idmap *idmap, struct dentry *dentry) { char *name, *xattr_list = NULL; @@ -1321,7 +1322,7 @@ int ksmbd_vfs_remove_acl_xattrs(struct user_namespace *user_ns, sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1) || !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1)) { - err = vfs_remove_acl(user_ns, dentry, name); + err = vfs_remove_acl(idmap, dentry, name); if (err) ksmbd_debug(SMB, "remove acl xattr failed : %s\n", name); @@ -1332,7 +1333,7 @@ out: return err; } -int ksmbd_vfs_remove_sd_xattrs(struct user_namespace *user_ns, +int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, struct dentry *dentry) { char *name, *xattr_list = NULL; @@ -1352,7 +1353,7 @@ int ksmbd_vfs_remove_sd_xattrs(struct user_namespace *user_ns, ksmbd_debug(SMB, "%s, len %zd\n", name, strlen(name)); if (!strncmp(name, XATTR_NAME_SD, XATTR_NAME_SD_LEN)) { - err = ksmbd_vfs_remove_xattr(user_ns, dentry, name); + err = ksmbd_vfs_remove_xattr(idmap, dentry, name); if (err) ksmbd_debug(SMB, "remove xattr failed : %s\n", name); } @@ -1362,7 +1363,7 @@ out: return err; } -static struct xattr_smb_acl *ksmbd_vfs_make_xattr_posix_acl(struct 
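
ksmbd_vfs_getxattr() above uses the classic probe-then-fetch shape: a first vfs_getxattr() with a NULL buffer returns the value's size, the buffer is allocated, and a second call reads the value (with the usual benign race if the value changes in between). The same shape in userspace, with a hypothetical helper name:

#include <sys/xattr.h>
#include <stdlib.h>
#include <stdio.h>

/* Read an xattr into a freshly allocated buffer; length or -1. */
static ssize_t getxattr_alloc(const char *path, const char *name, char **out)
{
	ssize_t len = getxattr(path, name, NULL, 0);	/* probe the size */
	char *buf;

	*out = NULL;
	if (len < 0)
		return len;
	buf = malloc(len + 1);
	if (!buf)
		return -1;
	len = getxattr(path, name, buf, len);	/* value may have changed */
	if (len > 0)
		*out = buf;
	else
		free(buf);
	return len;
}

int main(int argc, char **argv)
{
	char *val;
	ssize_t n;

	if (argc != 3)
		return 1;
	n = getxattr_alloc(argv[1], argv[2], &val);
	if (n < 0) {
		perror("getxattr");
		return 1;
	}
	printf("%zd bytes\n", n);
	free(val);
	return 0;
}
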
user_namespace *user_ns, +static struct xattr_smb_acl *ksmbd_vfs_make_xattr_posix_acl(struct mnt_idmap *idmap, struct inode *inode, int acl_type) { @@ -1392,14 +1393,14 @@ static struct xattr_smb_acl *ksmbd_vfs_make_xattr_posix_acl(struct user_namespac switch (pa_entry->e_tag) { case ACL_USER: xa_entry->type = SMB_ACL_USER; - xa_entry->uid = posix_acl_uid_translate(user_ns, pa_entry); + xa_entry->uid = posix_acl_uid_translate(idmap, pa_entry); break; case ACL_USER_OBJ: xa_entry->type = SMB_ACL_USER_OBJ; break; case ACL_GROUP: xa_entry->type = SMB_ACL_GROUP; - xa_entry->gid = posix_acl_gid_translate(user_ns, pa_entry); + xa_entry->gid = posix_acl_gid_translate(idmap, pa_entry); break; case ACL_GROUP_OBJ: xa_entry->type = SMB_ACL_GROUP_OBJ; @@ -1428,7 +1429,7 @@ out: } int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, - struct user_namespace *user_ns, + struct mnt_idmap *idmap, struct dentry *dentry, struct smb_ntsd *pntsd, int len) { @@ -1461,13 +1462,13 @@ int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, return rc; } - smb_acl = ksmbd_vfs_make_xattr_posix_acl(user_ns, inode, + smb_acl = ksmbd_vfs_make_xattr_posix_acl(idmap, inode, ACL_TYPE_ACCESS); if (S_ISDIR(inode->i_mode)) - def_smb_acl = ksmbd_vfs_make_xattr_posix_acl(user_ns, inode, + def_smb_acl = ksmbd_vfs_make_xattr_posix_acl(idmap, inode, ACL_TYPE_DEFAULT); - rc = ndr_encode_posix_acl(&acl_ndr, user_ns, inode, + rc = ndr_encode_posix_acl(&acl_ndr, idmap, inode, smb_acl, def_smb_acl); if (rc) { pr_err("failed to encode ndr to posix acl\n"); @@ -1487,7 +1488,7 @@ int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, goto out; } - rc = ksmbd_vfs_setxattr(user_ns, dentry, + rc = ksmbd_vfs_setxattr(idmap, dentry, XATTR_NAME_SD, sd_ndr.data, sd_ndr.offset, 0); if (rc < 0) @@ -1502,7 +1503,7 @@ out: } int ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn, - struct user_namespace *user_ns, + struct mnt_idmap *idmap, struct dentry *dentry, struct smb_ntsd **pntsd) { @@ -1514,7 +1515,7 @@ int ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn, struct xattr_smb_acl *smb_acl = NULL, *def_smb_acl = NULL; __u8 cmp_hash[XATTR_SD_HASH_SIZE] = {0}; - rc = ksmbd_vfs_getxattr(user_ns, dentry, XATTR_NAME_SD, &n.data); + rc = ksmbd_vfs_getxattr(idmap, dentry, XATTR_NAME_SD, &n.data); if (rc <= 0) return rc; @@ -1523,13 +1524,13 @@ int ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn, if (rc) goto free_n_data; - smb_acl = ksmbd_vfs_make_xattr_posix_acl(user_ns, inode, + smb_acl = ksmbd_vfs_make_xattr_posix_acl(idmap, inode, ACL_TYPE_ACCESS); if (S_ISDIR(inode->i_mode)) - def_smb_acl = ksmbd_vfs_make_xattr_posix_acl(user_ns, inode, + def_smb_acl = ksmbd_vfs_make_xattr_posix_acl(idmap, inode, ACL_TYPE_DEFAULT); - rc = ndr_encode_posix_acl(&acl_ndr, user_ns, inode, smb_acl, + rc = ndr_encode_posix_acl(&acl_ndr, idmap, inode, smb_acl, def_smb_acl); if (rc) { pr_err("failed to encode ndr to posix acl\n"); @@ -1576,7 +1577,7 @@ free_n_data: return rc; } -int ksmbd_vfs_set_dos_attrib_xattr(struct user_namespace *user_ns, +int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_dos_attrib *da) { @@ -1587,7 +1588,7 @@ int ksmbd_vfs_set_dos_attrib_xattr(struct user_namespace *user_ns, if (err) return err; - err = ksmbd_vfs_setxattr(user_ns, dentry, XATTR_NAME_DOS_ATTRIBUTE, + err = ksmbd_vfs_setxattr(idmap, dentry, XATTR_NAME_DOS_ATTRIBUTE, (void *)n.data, n.offset, 0); if (err) ksmbd_debug(SMB, "failed to store dos attribute in xattr\n"); @@ -1596,14 +1597,14 @@ int ksmbd_vfs_set_dos_attrib_xattr(struct user_namespace *user_ns, 
return err; } -int ksmbd_vfs_get_dos_attrib_xattr(struct user_namespace *user_ns, +int ksmbd_vfs_get_dos_attrib_xattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_dos_attrib *da) { struct ndr n; int err; - err = ksmbd_vfs_getxattr(user_ns, dentry, XATTR_NAME_DOS_ATTRIBUTE, + err = ksmbd_vfs_getxattr(idmap, dentry, XATTR_NAME_DOS_ATTRIBUTE, (char **)&n.data); if (err > 0) { n.length = err; @@ -1650,14 +1651,14 @@ void *ksmbd_vfs_init_kstat(char **p, struct ksmbd_kstat *ksmbd_kstat) } int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work, - struct user_namespace *user_ns, + struct mnt_idmap *idmap, struct dentry *dentry, struct ksmbd_kstat *ksmbd_kstat) { u64 time; int rc; - generic_fillattr(user_ns, d_inode(dentry), ksmbd_kstat->kstat); + generic_fillattr(idmap, d_inode(dentry), ksmbd_kstat->kstat); time = ksmbd_UnixTimeToNT(ksmbd_kstat->kstat->ctime); ksmbd_kstat->create_time = time; @@ -1675,7 +1676,7 @@ int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work, KSMBD_SHARE_FLAG_STORE_DOS_ATTRS)) { struct xattr_dos_attrib da; - rc = ksmbd_vfs_get_dos_attrib_xattr(user_ns, dentry, &da); + rc = ksmbd_vfs_get_dos_attrib_xattr(idmap, dentry, &da); if (rc > 0) { ksmbd_kstat->file_attributes = cpu_to_le32(da.attr); ksmbd_kstat->create_time = da.create_time; @@ -1687,7 +1688,7 @@ int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work, return 0; } -ssize_t ksmbd_vfs_casexattr_len(struct user_namespace *user_ns, +ssize_t ksmbd_vfs_casexattr_len(struct mnt_idmap *idmap, struct dentry *dentry, char *attr_name, int attr_name_len) { @@ -1704,7 +1705,7 @@ ssize_t ksmbd_vfs_casexattr_len(struct user_namespace *user_ns, if (strncasecmp(attr_name, name, attr_name_len)) continue; - value_len = ksmbd_vfs_xattr_len(user_ns, dentry, name); + value_len = ksmbd_vfs_xattr_len(idmap, dentry, name); break; } @@ -1823,7 +1824,7 @@ void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock) locks_delete_block(flock); } -int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, +int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, struct dentry *dentry) { struct posix_acl_state acl_state; @@ -1857,13 +1858,13 @@ int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, return -ENOMEM; } posix_state_to_acl(&acl_state, acls->a_entries); - rc = set_posix_acl(user_ns, dentry, ACL_TYPE_ACCESS, acls); + rc = set_posix_acl(idmap, dentry, ACL_TYPE_ACCESS, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", rc); else if (S_ISDIR(inode->i_mode)) { posix_state_to_acl(&acl_state, acls->a_entries); - rc = set_posix_acl(user_ns, dentry, ACL_TYPE_DEFAULT, acls); + rc = set_posix_acl(idmap, dentry, ACL_TYPE_DEFAULT, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n", rc); @@ -1873,7 +1874,7 @@ int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, return rc; } -int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns, +int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct inode *parent_inode) { struct posix_acl *acls; @@ -1896,12 +1897,12 @@ int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns, } } - rc = set_posix_acl(user_ns, dentry, ACL_TYPE_ACCESS, acls); + rc = set_posix_acl(idmap, dentry, ACL_TYPE_ACCESS, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", rc); if (S_ISDIR(inode->i_mode)) { - rc = set_posix_acl(user_ns, dentry, ACL_TYPE_DEFAULT, + rc = set_posix_acl(idmap, dentry, ACL_TYPE_DEFAULT, acls); if (rc < 0) 
ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n", diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h index 0d73d735cc39..9d676ab0cd25 100644 --- a/fs/ksmbd/vfs.h +++ b/fs/ksmbd/vfs.h @@ -71,10 +71,10 @@ struct ksmbd_kstat { __le32 file_attributes; }; -int ksmbd_vfs_lock_parent(struct user_namespace *user_ns, struct dentry *parent, +int ksmbd_vfs_lock_parent(struct mnt_idmap *idmap, struct dentry *parent, struct dentry *child); -int ksmbd_vfs_may_delete(struct user_namespace *user_ns, struct dentry *dentry); -int ksmbd_vfs_query_maximal_access(struct user_namespace *user_ns, +int ksmbd_vfs_may_delete(struct mnt_idmap *idmap, struct dentry *dentry); +int ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap, struct dentry *dentry, __le32 *daccess); int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode); int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode); @@ -102,19 +102,19 @@ int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work, unsigned int *chunk_size_written, loff_t *total_size_written); ssize_t ksmbd_vfs_listxattr(struct dentry *dentry, char **list); -ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns, +ssize_t ksmbd_vfs_getxattr(struct mnt_idmap *idmap, struct dentry *dentry, char *xattr_name, char **xattr_buf); -ssize_t ksmbd_vfs_casexattr_len(struct user_namespace *user_ns, +ssize_t ksmbd_vfs_casexattr_len(struct mnt_idmap *idmap, struct dentry *dentry, char *attr_name, int attr_name_len); -int ksmbd_vfs_setxattr(struct user_namespace *user_ns, +int ksmbd_vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *attr_name, void *attr_value, size_t attr_size, int flags); int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name, size_t *xattr_stream_name_size, int s_type); -int ksmbd_vfs_remove_xattr(struct user_namespace *user_ns, +int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, struct dentry *dentry, char *attr_name); int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name, unsigned int flags, struct path *path, @@ -131,37 +131,37 @@ struct file_allocated_range_buffer; int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, struct file_allocated_range_buffer *ranges, unsigned int in_count, unsigned int *out_count); -int ksmbd_vfs_unlink(struct user_namespace *user_ns, - struct dentry *dir, struct dentry *dentry); +int ksmbd_vfs_unlink(struct mnt_idmap *idmap, struct dentry *dir, + struct dentry *dentry); void *ksmbd_vfs_init_kstat(char **p, struct ksmbd_kstat *ksmbd_kstat); int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work, - struct user_namespace *user_ns, + struct mnt_idmap *idmap, struct dentry *dentry, struct ksmbd_kstat *ksmbd_kstat); void ksmbd_vfs_posix_lock_wait(struct file_lock *flock); int ksmbd_vfs_posix_lock_wait_timeout(struct file_lock *flock, long timeout); void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock); -int ksmbd_vfs_remove_acl_xattrs(struct user_namespace *user_ns, +int ksmbd_vfs_remove_acl_xattrs(struct mnt_idmap *idmap, struct dentry *dentry); -int ksmbd_vfs_remove_sd_xattrs(struct user_namespace *user_ns, +int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, struct dentry *dentry); int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, - struct user_namespace *user_ns, + struct mnt_idmap *idmap, struct dentry *dentry, struct smb_ntsd *pntsd, int len); int ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn, - struct user_namespace *user_ns, + struct mnt_idmap *idmap, struct dentry *dentry, struct smb_ntsd **pntsd); 
-int ksmbd_vfs_set_dos_attrib_xattr(struct user_namespace *user_ns, +int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_dos_attrib *da); -int ksmbd_vfs_get_dos_attrib_xattr(struct user_namespace *user_ns, +int ksmbd_vfs_get_dos_attrib_xattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_dos_attrib *da); -int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, +int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, struct dentry *dentry); -int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns, +int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct inode *parent_inode); #endif /* __KSMBD_VFS_H__ */ diff --git a/fs/ksmbd/vfs_cache.c b/fs/ksmbd/vfs_cache.c index da9163b00350..054a7d2e0f48 100644 --- a/fs/ksmbd/vfs_cache.c +++ b/fs/ksmbd/vfs_cache.c @@ -5,6 +5,7 @@ */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/slab.h> #include <linux/vmalloc.h> @@ -251,7 +252,7 @@ static void __ksmbd_inode_close(struct ksmbd_file *fp) filp = fp->filp; if (ksmbd_stream_fd(fp) && (ci->m_flags & S_DEL_ON_CLS_STREAM)) { ci->m_flags &= ~S_DEL_ON_CLS_STREAM; - err = ksmbd_vfs_remove_xattr(file_mnt_user_ns(filp), + err = ksmbd_vfs_remove_xattr(file_mnt_idmap(filp), filp->f_path.dentry, fp->stream.name); if (err) @@ -266,7 +267,7 @@ static void __ksmbd_inode_close(struct ksmbd_file *fp) dir = dentry->d_parent; ci->m_flags &= ~(S_DEL_ON_CLS | S_DEL_PENDING); write_unlock(&ci->m_lock); - ksmbd_vfs_unlink(file_mnt_user_ns(filp), dir, dentry); + ksmbd_vfs_unlink(file_mnt_idmap(filp), dir, dentry); write_lock(&ci->m_lock); } write_unlock(&ci->m_lock); @@ -364,12 +365,11 @@ static void __put_fd_final(struct ksmbd_work *work, struct ksmbd_file *fp) static void set_close_state_blocked_works(struct ksmbd_file *fp) { - struct ksmbd_work *cancel_work, *ctmp; + struct ksmbd_work *cancel_work; spin_lock(&fp->f_lock); - list_for_each_entry_safe(cancel_work, ctmp, &fp->blocked_works, + list_for_each_entry(cancel_work, &fp->blocked_works, fp_entry) { - list_del(&cancel_work->fp_entry); cancel_work->state = KSMBD_WORK_CLOSED; cancel_work->cancel_fn(cancel_work->cancel_argv); } diff --git a/fs/libfs.c b/fs/libfs.c index aada4e7c8713..4eda519c3002 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -28,12 +28,12 @@ #include "internal.h" -int simple_getattr(struct user_namespace *mnt_userns, const struct path *path, +int simple_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9); return 0; } @@ -473,7 +473,7 @@ int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, } EXPORT_SYMBOL_GPL(simple_rename_exchange); -int simple_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +int simple_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -509,7 +509,7 @@ EXPORT_SYMBOL(simple_rename); /** * simple_setattr - setattr for simple filesystem - * @mnt_userns: user namespace of the target mount + * @idmap: idmap of the target mount * @dentry: dentry * @iattr: iattr structure * @@ -522,19 +522,19 @@ EXPORT_SYMBOL(simple_rename); * on simple regular filesystems. 
Anything that needs to change on-disk * or wire state on size changes needs its own setattr method. */ -int simple_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int simple_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(mnt_userns, dentry, iattr); + error = setattr_prepare(idmap, dentry, iattr); if (error) return error; if (iattr->ia_valid & ATTR_SIZE) truncate_setsize(inode, iattr->ia_size); - setattr_copy(mnt_userns, inode, iattr); + setattr_copy(idmap, inode, iattr); mark_inode_dirty(inode); return 0; } @@ -1315,16 +1315,16 @@ static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(-ENOENT); } -static int empty_dir_getattr(struct user_namespace *mnt_userns, +static int empty_dir_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); return 0; } -static int empty_dir_setattr(struct user_namespace *mnt_userns, +static int empty_dir_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { return -EPERM; @@ -1582,3 +1582,39 @@ bool inode_maybe_inc_iversion(struct inode *inode, bool force) return true; } EXPORT_SYMBOL(inode_maybe_inc_iversion); + +/** + * inode_query_iversion - read i_version for later use + * @inode: inode from which i_version should be read + * + * Read the inode i_version counter. This should be used by callers that wish + * to store the returned i_version for later comparison. This will guarantee + * that a later query of the i_version will result in a different value if + * anything has changed. + * + * In this implementation, we fetch the current value, set the QUERIED flag and + * then try to swap it into place with a cmpxchg, if it wasn't already set. If + * that fails, we try again with the newly fetched value from the cmpxchg. + */ +u64 inode_query_iversion(struct inode *inode) +{ + u64 cur, new; + + cur = inode_peek_iversion_raw(inode); + do { + /* If flag is already set, then no need to swap */ + if (cur & I_VERSION_QUERIED) { + /* + * This barrier (and the implicit barrier in the + * cmpxchg below) pairs with the barrier in + * inode_maybe_inc_iversion(). + */ + smp_mb(); + break; + } + + new = cur | I_VERSION_QUERIED; + } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); + return cur >> I_VERSION_QUERIED_SHIFT; +} +EXPORT_SYMBOL(inode_query_iversion); diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index a5bb3f721a9d..82b19a30e0f0 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -188,7 +188,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) continue; if (!rpc_cmp_addr(nlm_addr(block->b_host), addr)) continue; - if (nfs_compare_fh(NFS_FH(locks_inode(fl_blocked->fl_file)), fh) != 0) + if (nfs_compare_fh(NFS_FH(file_inode(fl_blocked->fl_file)), fh) != 0) continue; /* Alright, we found a lock. 
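Looping back to the inode_query_iversion() helper added to fs/libfs.c above: it pairs with inode_maybe_inc_iversion(), which only bumps the counter once a query has set the QUERIED flag, so writers stay cheap until someone actually samples the value. A minimal consumer sketch (the cached-value bookkeeping is hypothetical):

/* Sketch: has anything changed since the last sample? */
static bool example_iversion_changed(struct inode *inode, u64 *cached)
{
	u64 cur = inode_query_iversion(inode);	/* sets I_VERSION_QUERIED */

	if (cur == *cached)
		return false;
	*cached = cur;
	return true;
}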
Set the return status * and wake up the caller diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 99fffc9cb958..16b4de868cd2 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -12,6 +12,7 @@ #include <linux/types.h> #include <linux/errno.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/nfs_fs.h> #include <linux/utsname.h> #include <linux/freezer.h> @@ -130,7 +131,7 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl) char *nodename = req->a_host->h_rpcclnt->cl_nodename; nlmclnt_next_cookie(&argp->cookie); - memcpy(&lock->fh, NFS_FH(locks_inode(fl->fl_file)), sizeof(struct nfs_fh)); + memcpy(&lock->fh, NFS_FH(file_inode(fl->fl_file)), sizeof(struct nfs_fh)); lock->caller = nodename; lock->oh.data = req->a_owner; lock->oh.len = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s", diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h index 5bec78c8e431..17432c445fe6 100644 --- a/fs/lockd/netns.h +++ b/fs/lockd/netns.h @@ -3,6 +3,7 @@ #define __LOCKD_NETNS_H__ #include <linux/fs.h> +#include <linux/filelock.h> #include <net/netns/generic.h> struct lockd_net { diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 59ef8a1f843f..9a47303b2cba 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -496,7 +496,7 @@ static struct ctl_table nlm_sysctls[] = { { .procname = "nsm_use_hostnames", .data = &nsm_use_hostnames, - .maxlen = sizeof(int), + .maxlen = sizeof(bool), .mode = 0644, .proc_handler = proc_dobool, }, @@ -685,17 +685,16 @@ module_exit(exit_nlm); /** * nlmsvc_dispatch - Process an NLM Request * @rqstp: incoming request - * @statp: pointer to location of accept_stat field in RPC Reply buffer * * Return values: * %0: Processing complete; do not send a Reply * %1: Processing complete; send Reply in rqstp->rq_res */ -static int nlmsvc_dispatch(struct svc_rqst *rqstp, __be32 *statp) +static int nlmsvc_dispatch(struct svc_rqst *rqstp) { const struct svc_procedure *procp = rqstp->rq_procinfo; + __be32 *statp = rqstp->rq_accept_statp; - svcxdr_init_decode(rqstp); if (!procp->pc_decode(rqstp, &rqstp->rq_arg_stream)) goto out_decode_err; @@ -705,7 +704,6 @@ static int nlmsvc_dispatch(struct svc_rqst *rqstp, __be32 *statp) if (*statp != rpc_success) return 1; - svcxdr_init_encode(rqstp); if (!procp->pc_encode(rqstp, &rqstp->rq_res_stream)) goto out_encode_err; @@ -723,7 +721,7 @@ out_encode_err: /* * Define NLM program and procedures */ -static unsigned int nlmsvc_version1_count[17]; +static DEFINE_PER_CPU_ALIGNED(unsigned long, nlmsvc_version1_count[17]); static const struct svc_version nlmsvc_version1 = { .vs_vers = 1, .vs_nproc = 17, @@ -732,26 +730,31 @@ static const struct svc_version nlmsvc_version1 = { .vs_dispatch = nlmsvc_dispatch, .vs_xdrsize = NLMSVC_XDRSIZE, }; -static unsigned int nlmsvc_version3_count[24]; + +static DEFINE_PER_CPU_ALIGNED(unsigned long, + nlmsvc_version3_count[ARRAY_SIZE(nlmsvc_procedures)]); static const struct svc_version nlmsvc_version3 = { .vs_vers = 3, - .vs_nproc = 24, + .vs_nproc = ARRAY_SIZE(nlmsvc_procedures), .vs_proc = nlmsvc_procedures, .vs_count = nlmsvc_version3_count, .vs_dispatch = nlmsvc_dispatch, .vs_xdrsize = NLMSVC_XDRSIZE, }; + #ifdef CONFIG_LOCKD_V4 -static unsigned int nlmsvc_version4_count[24]; +static DEFINE_PER_CPU_ALIGNED(unsigned long, + nlmsvc_version4_count[ARRAY_SIZE(nlmsvc_procedures4)]); static const struct svc_version nlmsvc_version4 = { .vs_vers = 4, - .vs_nproc = 24, + .vs_nproc = ARRAY_SIZE(nlmsvc_procedures4), .vs_proc = nlmsvc_procedures4, .vs_count = nlmsvc_version4_count, 
.vs_dispatch = nlmsvc_dispatch, .vs_xdrsize = NLMSVC_XDRSIZE, }; #endif + static const struct svc_version *nlmsvc_version[] = { [1] = &nlmsvc_version1, [3] = &nlmsvc_version3, diff --git a/fs/locks.c b/fs/locks.c index 8f01bee17715..66b4eef09db5 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -52,6 +52,7 @@ #include <linux/capability.h> #include <linux/file.h> #include <linux/fdtable.h> +#include <linux/filelock.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/security.h> @@ -233,7 +234,7 @@ locks_check_ctx_file_list(struct file *filp, struct list_head *list, char *list_type) { struct file_lock *fl; - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); list_for_each_entry(fl, list, fl_list) if (fl->fl_file == filp) @@ -887,7 +888,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl) { struct file_lock *cfl; struct file_lock_context *ctx; - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); void *owner; void (*func)(void); @@ -1330,7 +1331,7 @@ retry: int posix_lock_file(struct file *filp, struct file_lock *fl, struct file_lock *conflock) { - return posix_lock_inode(locks_inode(filp), fl, conflock); + return posix_lock_inode(file_inode(filp), fl, conflock); } EXPORT_SYMBOL(posix_lock_file); @@ -1629,7 +1630,7 @@ EXPORT_SYMBOL(lease_get_mtime); int fcntl_getlease(struct file *filp) { struct file_lock *fl; - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); struct file_lock_context *ctx; int type = F_UNLCK; LIST_HEAD(dispose); @@ -1667,7 +1668,7 @@ int fcntl_getlease(struct file *filp) static int check_conflicting_open(struct file *filp, const long arg, int flags) { - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); int self_wcount = 0, self_rcount = 0; if (flags & FL_LAYOUT) @@ -1703,7 +1704,7 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv) { struct file_lock *fl, *my_fl = NULL, *lease; - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); struct file_lock_context *ctx; bool is_deleg = (*flp)->fl_flags & FL_DELEG; int error; @@ -1819,7 +1820,7 @@ static int generic_delete_lease(struct file *filp, void *owner) { int error = -EAGAIN; struct file_lock *fl, *victim = NULL; - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); struct file_lock_context *ctx; LIST_HEAD(dispose); @@ -1861,7 +1862,7 @@ static int generic_delete_lease(struct file *filp, void *owner) int generic_setlease(struct file *filp, long arg, struct file_lock **flp, void **priv) { - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); int error; if ((!uid_eq(current_fsuid(), inode->i_uid)) && !capable(CAP_LEASE)) @@ -1889,7 +1890,6 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp, } EXPORT_SYMBOL(generic_setlease); -#if IS_ENABLED(CONFIG_SRCU) /* * Kernel subsystems can register to be notified on any attempt to set * a new lease with the lease_notifier_chain. This is used by (e.g.) 
nfsd @@ -1923,30 +1923,6 @@ void lease_unregister_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(lease_unregister_notifier); -#else /* !IS_ENABLED(CONFIG_SRCU) */ -static inline void -lease_notifier_chain_init(void) -{ -} - -static inline void -setlease_notifier(long arg, struct file_lock *lease) -{ -} - -int lease_register_notifier(struct notifier_block *nb) -{ - return 0; -} -EXPORT_SYMBOL_GPL(lease_register_notifier); - -void lease_unregister_notifier(struct notifier_block *nb) -{ -} -EXPORT_SYMBOL_GPL(lease_unregister_notifier); - -#endif /* IS_ENABLED(CONFIG_SRCU) */ - /** * vfs_setlease - sets a lease on an open file * @filp: file pointer @@ -2350,7 +2326,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, struct flock *flock) { struct file_lock *file_lock = locks_alloc_lock(); - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); struct file *f; int error; @@ -2554,7 +2530,7 @@ out: void locks_remove_posix(struct file *filp, fl_owner_t owner) { int error; - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); struct file_lock lock; struct file_lock_context *ctx; @@ -2591,7 +2567,7 @@ static void locks_remove_flock(struct file *filp, struct file_lock_context *flctx) { struct file_lock fl; - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); if (list_empty(&flctx->flc_flock)) return; @@ -2636,7 +2612,7 @@ void locks_remove_file(struct file *filp) { struct file_lock_context *ctx; - ctx = locks_inode_context(locks_inode(filp)); + ctx = locks_inode_context(file_inode(filp)); if (!ctx) return; @@ -2720,7 +2696,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, */ if (fl->fl_file != NULL) - inode = locks_inode(fl->fl_file); + inode = file_inode(fl->fl_file); seq_printf(f, "%lld: ", id); @@ -2861,7 +2837,7 @@ static void __show_fd_locks(struct seq_file *f, void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files) { - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); struct file_lock_context *ctx; int id = 0; diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index 9115948c624e..870207ba23f1 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c @@ -210,7 +210,7 @@ void minix_free_inode(struct inode * inode) mark_buffer_dirty(bh); } -struct inode *minix_new_inode(const struct inode *dir, umode_t mode, int *error) +struct inode *minix_new_inode(const struct inode *dir, umode_t mode) { struct super_block *sb = dir->i_sb; struct minix_sb_info *sbi = minix_sb(sb); @@ -220,13 +220,10 @@ struct inode *minix_new_inode(const struct inode *dir, umode_t mode, int *error) unsigned long j; int i; - if (!inode) { - *error = -ENOMEM; - return NULL; - } + if (!inode) + return ERR_PTR(-ENOMEM); j = bits_per_zone; bh = NULL; - *error = -ENOSPC; spin_lock(&bitmap_lock); for (i = 0; i < sbi->s_imap_blocks; i++) { bh = sbi->s_imap[i]; @@ -237,22 +234,22 @@ struct inode *minix_new_inode(const struct inode *dir, umode_t mode, int *error) if (!bh || j >= bits_per_zone) { spin_unlock(&bitmap_lock); iput(inode); - return NULL; + return ERR_PTR(-ENOSPC); } if (minix_test_and_set_bit(j, bh->b_data)) { /* shouldn't happen */ spin_unlock(&bitmap_lock); printk("minix_new_inode: bit already set\n"); iput(inode); - return NULL; + return ERR_PTR(-ENOSPC); } spin_unlock(&bitmap_lock); mark_buffer_dirty(bh); j += i * bits_per_zone; if (!j || j > sbi->s_ninodes) { iput(inode); - return NULL; + return ERR_PTR(-ENOSPC); } 
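The hunk above is part of switching minix_new_inode() from the NULL-plus-int-out-parameter convention to plain ERR_PTR() returns; the callers in fs/minix/namei.c below are reworked to match. A minimal sketch of the new calling convention:

	struct inode *inode = minix_new_inode(dir, mode);

	if (IS_ERR(inode))
		return PTR_ERR(inode);	/* -ENOMEM or -ENOSPC from the bitmap code */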
- inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_ino = j; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_blocks = 0; @@ -260,7 +257,6 @@ struct inode *minix_new_inode(const struct inode *dir, umode_t mode, int *error) insert_inode_hash(inode); mark_inode_dirty(inode); - *error = 0; return inode; } diff --git a/fs/minix/dir.c b/fs/minix/dir.c index dcfe5b25378b..bf9858f76b6a 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -46,21 +46,27 @@ minix_last_byte(struct inode *inode, unsigned long page_nr) return last_byte; } -static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len) +static void dir_commit_chunk(struct page *page, loff_t pos, unsigned len) { struct address_space *mapping = page->mapping; struct inode *dir = mapping->host; - int err = 0; + block_write_end(NULL, mapping, pos, len, len, page, NULL); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); mark_inode_dirty(dir); } - if (IS_DIRSYNC(dir)) - err = write_one_page(page); - else - unlock_page(page); + unlock_page(page); +} + +static int minix_handle_dirsync(struct inode *dir) +{ + int err; + + err = filemap_write_and_wait(dir->i_mapping); + if (!err) + err = sync_inode_metadata(dir, 1); return err; } @@ -274,9 +280,10 @@ got_it: memset (namx + namelen, 0, sbi->s_dirsize - namelen - 2); de->inode = inode->i_ino; } - err = dir_commit_chunk(page, pos, sbi->s_dirsize); + dir_commit_chunk(page, pos, sbi->s_dirsize); dir->i_mtime = dir->i_ctime = current_time(dir); mark_inode_dirty(dir); + err = minix_handle_dirsync(dir); out_put: dir_put_page(page); out: @@ -297,19 +304,18 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page) lock_page(page); err = minix_prepare_chunk(page, pos, len); - if (err == 0) { - if (sbi->s_version == MINIX_V3) - ((minix3_dirent *) de)->inode = 0; - else - de->inode = 0; - err = dir_commit_chunk(page, pos, len); - } else { + if (err) { unlock_page(page); + return err; } - dir_put_page(page); + if (sbi->s_version == MINIX_V3) + ((minix3_dirent *)de)->inode = 0; + else + de->inode = 0; + dir_commit_chunk(page, pos, len); inode->i_ctime = inode->i_mtime = current_time(inode); mark_inode_dirty(inode); - return err; + return minix_handle_dirsync(inode); } int minix_make_empty(struct inode *inode, struct inode *dir) @@ -349,7 +355,8 @@ int minix_make_empty(struct inode *inode, struct inode *dir) } kunmap_atomic(kaddr); - err = dir_commit_chunk(page, 0, 2 * sbi->s_dirsize); + dir_commit_chunk(page, 0, 2 * sbi->s_dirsize); + err = minix_handle_dirsync(inode); fail: put_page(page); return err; @@ -409,8 +416,8 @@ not_empty: } /* Releases the page */ -void minix_set_link(struct minix_dir_entry *de, struct page *page, - struct inode *inode) +int minix_set_link(struct minix_dir_entry *de, struct page *page, + struct inode *inode) { struct inode *dir = page->mapping->host; struct minix_sb_info *sbi = minix_sb(dir->i_sb); @@ -419,20 +426,19 @@ void minix_set_link(struct minix_dir_entry *de, struct page *page, int err; lock_page(page); - err = minix_prepare_chunk(page, pos, sbi->s_dirsize); - if (err == 0) { - if (sbi->s_version == MINIX_V3) - ((minix3_dirent *) de)->inode = inode->i_ino; - else - de->inode = inode->i_ino; - err = dir_commit_chunk(page, pos, sbi->s_dirsize); - } else { + if (err) { unlock_page(page); + return err; } - dir_put_page(page); + if (sbi->s_version == MINIX_V3) + ((minix3_dirent *)de)->inode = inode->i_ino; + else + de->inode = inode->i_ino; + 
dir_commit_chunk(page, pos, sbi->s_dirsize); dir->i_mtime = dir->i_ctime = current_time(dir); mark_inode_dirty(dir); + return minix_handle_dirsync(dir); } struct minix_dir_entry * minix_dotdot (struct inode *dir, struct page **p) diff --git a/fs/minix/file.c b/fs/minix/file.c index 6a7bd2d9eec0..0dd05d47724a 100644 --- a/fs/minix/file.c +++ b/fs/minix/file.c @@ -22,13 +22,13 @@ const struct file_operations minix_file_operations = { .splice_read = generic_file_splice_read, }; -static int minix_setattr(struct user_namespace *mnt_userns, +static int minix_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; @@ -42,7 +42,7 @@ static int minix_setattr(struct user_namespace *mnt_userns, minix_truncate(inode); } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index da8bdd1712a7..e9fbb5303a22 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -654,13 +654,13 @@ static int minix_write_inode(struct inode *inode, struct writeback_control *wbc) return err; } -int minix_getattr(struct user_namespace *mnt_userns, const struct path *path, +int minix_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct super_block *sb = path->dentry->d_sb; struct inode *inode = d_inode(path->dentry); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); if (INODE_VERSION(inode) == MINIX_V1) stat->blocks = (BLOCK_SIZE / 512) * V1_minix_blocks(stat->size, sb); else diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 202173368025..d493507c064f 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -45,13 +45,13 @@ struct minix_sb_info { extern struct inode *minix_iget(struct super_block *, unsigned long); extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **); extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); -extern struct inode * minix_new_inode(const struct inode *, umode_t, int *); +extern struct inode * minix_new_inode(const struct inode *, umode_t); extern void minix_free_inode(struct inode * inode); extern unsigned long minix_count_free_inodes(struct super_block *sb); extern int minix_new_block(struct inode * inode); extern void minix_free_block(struct inode *inode, unsigned long block); extern unsigned long minix_count_free_blocks(struct super_block *sb); -extern int minix_getattr(struct user_namespace *, const struct path *, +extern int minix_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len); @@ -69,7 +69,8 @@ extern int minix_add_link(struct dentry*, struct inode*); extern int minix_delete_entry(struct minix_dir_entry*, struct page*); extern int minix_make_empty(struct inode*, struct inode*); extern int minix_empty_dir(struct inode*); -extern void minix_set_link(struct minix_dir_entry*, struct page*, struct inode*); +int minix_set_link(struct minix_dir_entry *de, struct page *page, + struct inode *inode); extern struct minix_dir_entry *minix_dotdot(struct inode*, struct page**); extern ino_t minix_inode_by_name(struct dentry*); diff --git a/fs/minix/namei.c 
b/fs/minix/namei.c index 8afdc408ca4f..956d5183828d 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -33,71 +33,64 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, un return d_splice_alias(inode, dentry); } -static int minix_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int minix_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { - int error; struct inode *inode; if (!old_valid_dev(rdev)) return -EINVAL; - inode = minix_new_inode(dir, mode, &error); + inode = minix_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); - if (inode) { - minix_set_inode(inode, rdev); - mark_inode_dirty(inode); - error = add_nondir(dentry, inode); - } - return error; + minix_set_inode(inode, rdev); + mark_inode_dirty(inode); + return add_nondir(dentry, inode); } -static int minix_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +static int minix_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { - int error; - struct inode *inode = minix_new_inode(dir, mode, &error); - if (inode) { - minix_set_inode(inode, 0); - mark_inode_dirty(inode); - d_tmpfile(file, inode); - } - return finish_open_simple(file, error); + struct inode *inode = minix_new_inode(dir, mode); + + if (IS_ERR(inode)) + return finish_open_simple(file, PTR_ERR(inode)); + minix_set_inode(inode, 0); + mark_inode_dirty(inode); + d_tmpfile(file, inode); + return finish_open_simple(file, 0); } -static int minix_create(struct user_namespace *mnt_userns, struct inode *dir, +static int minix_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - return minix_mknod(mnt_userns, dir, dentry, mode, 0); + return minix_mknod(&nop_mnt_idmap, dir, dentry, mode, 0); } -static int minix_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int minix_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { - int err = -ENAMETOOLONG; int i = strlen(symname)+1; struct inode * inode; + int err; if (i > dir->i_sb->s_blocksize) - goto out; + return -ENAMETOOLONG; - inode = minix_new_inode(dir, S_IFLNK | 0777, &err); - if (!inode) - goto out; + inode = minix_new_inode(dir, S_IFLNK | 0777); + if (IS_ERR(inode)) + return PTR_ERR(inode); minix_set_inode(inode, 0); err = page_symlink(inode, symname, i); - if (err) - goto out_fail; - - err = add_nondir(dentry, inode); -out: - return err; - -out_fail: - inode_dec_link_count(inode); - iput(inode); - goto out; + if (unlikely(err)) { + inode_dec_link_count(inode); + iput(inode); + return err; + } + return add_nondir(dentry, inode); } static int minix_link(struct dentry * old_dentry, struct inode * dir, @@ -111,20 +104,18 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir, return add_nondir(dentry, inode); } -static int minix_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int minix_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode * inode; int err; - inode_inc_link_count(dir); - - inode = minix_new_inode(dir, S_IFDIR | mode, &err); - if (!inode) - goto out_dir; + inode = minix_new_inode(dir, S_IFDIR | mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + inode_inc_link_count(dir); minix_set_inode(inode, 0); - inode_inc_link_count(inode); err = minix_make_empty(inode, dir); @@ -143,30 +134,29 @@ out_fail: inode_dec_link_count(inode); inode_dec_link_count(inode); 
iput(inode); -out_dir: inode_dec_link_count(dir); goto out; } static int minix_unlink(struct inode * dir, struct dentry *dentry) { - int err = -ENOENT; struct inode * inode = d_inode(dentry); struct page * page; struct minix_dir_entry * de; + int err; de = minix_find_entry(dentry, &page); if (!de) - goto end_unlink; - + return -ENOENT; err = minix_delete_entry(de, page); - if (err) - goto end_unlink; + kunmap(page); + put_page(page); + if (err) + return err; inode->i_ctime = dir->i_ctime; inode_dec_link_count(inode); -end_unlink: - return err; + return 0; } static int minix_rmdir(struct inode * dir, struct dentry *dentry) @@ -184,7 +174,7 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry) return err; } -static int minix_rename(struct user_namespace *mnt_userns, +static int minix_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) @@ -223,7 +213,11 @@ static int minix_rename(struct user_namespace *mnt_userns, new_de = minix_find_entry(new_dentry, &new_page); if (!new_de) goto out_dir; - minix_set_link(new_de, new_page, old_inode); + err = minix_set_link(new_de, new_page, old_inode); + kunmap(new_page); + put_page(new_page); + if (err) + goto out_dir; new_inode->i_ctime = current_time(new_inode); if (dir_de) drop_nlink(new_inode); @@ -236,15 +230,17 @@ static int minix_rename(struct user_namespace *mnt_userns, inode_inc_link_count(new_dir); } - minix_delete_entry(old_de, old_page); + err = minix_delete_entry(old_de, old_page); + if (err) + goto out_dir; + mark_inode_dirty(old_inode); if (dir_de) { - minix_set_link(dir_de, dir_page, new_dir); - inode_dec_link_count(old_dir); + err = minix_set_link(dir_de, dir_page, new_dir); + if (!err) + inode_dec_link_count(old_dir); } - return 0; - out_dir: if (dir_de) { kunmap(dir_page); diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c new file mode 100644 index 000000000000..4905665c47d0 --- /dev/null +++ b/fs/mnt_idmapping.c @@ -0,0 +1,273 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Christian Brauner <brauner@kernel.org> */ + +#include <linux/cred.h> +#include <linux/fs.h> +#include <linux/mnt_idmapping.h> +#include <linux/slab.h> +#include <linux/user_namespace.h> + +#include "internal.h" + +struct mnt_idmap { + struct user_namespace *owner; + refcount_t count; +}; + +/* + * Carries the initial idmapping of 0:0:4294967295 which is an identity + * mapping. This means that {g,u}id 0 is mapped to {g,u}id 0, {g,u}id 1 is + * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...]. + */ +struct mnt_idmap nop_mnt_idmap = { + .owner = &init_user_ns, + .count = REFCOUNT_INIT(1), +}; +EXPORT_SYMBOL_GPL(nop_mnt_idmap); + +/** + * check_fsmapping - check whether a mount idmapping is allowed + * @idmap: idmap of the relevant mount + * @sb: super block of the filesystem + * + * Return: true if @idmap is allowed, false if not. + */ +bool check_fsmapping(const struct mnt_idmap *idmap, + const struct super_block *sb) +{ + return idmap->owner != sb->s_user_ns; +} + +/** + * initial_idmapping - check whether this is the initial mapping + * @ns: idmapping to check + * + * Check whether this is the initial mapping, mapping 0 to 0, 1 to 1, + * [...], 1000 to 1000 [...]. + * + * Return: true if this is the initial mapping, false if not.
+ */ +static inline bool initial_idmapping(const struct user_namespace *ns) +{ + return ns == &init_user_ns; +} + +/** + * no_idmapping - check whether we can skip remapping a kuid/gid + * @mnt_userns: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * + * This function can be used to check whether a remapping between two + * idmappings is required. + * An idmapped mount is a mount that has an idmapping attached to it that + * is different from the filesystem's idmapping and the initial idmapping. + * If the initial mapping is used or the idmapping of the mount and the + * filesystem are identical no remapping is required. + * + * Return: true if remapping can be skipped, false if not. + */ +static inline bool no_idmapping(const struct user_namespace *mnt_userns, + const struct user_namespace *fs_userns) +{ + return initial_idmapping(mnt_userns) || mnt_userns == fs_userns; +} + +/** + * make_vfsuid - map a filesystem kuid according to an idmapping + * @idmap: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * @kuid: kuid to be mapped + * + * Take a @kuid and remap it from @fs_userns into @idmap. Use this + * function when preparing a @kuid to be reported to userspace. + * + * If no_idmapping() determines that this is not an idmapped mount we can + * simply return @kuid unchanged. + * If initial_idmapping() tells us that the filesystem is not mounted with an + * idmapping we know the value of @kuid won't change when calling + * from_kuid() so we can simply retrieve the value via __kuid_val() + * directly. + * + * Return: @kuid mapped according to @idmap. + * If @kuid has no mapping in either @idmap or @fs_userns INVALID_UID is + * returned. + */ + +vfsuid_t make_vfsuid(struct mnt_idmap *idmap, + struct user_namespace *fs_userns, + kuid_t kuid) +{ + uid_t uid; + struct user_namespace *mnt_userns = idmap->owner; + + if (no_idmapping(mnt_userns, fs_userns)) + return VFSUIDT_INIT(kuid); + if (initial_idmapping(fs_userns)) + uid = __kuid_val(kuid); + else + uid = from_kuid(fs_userns, kuid); + if (uid == (uid_t)-1) + return INVALID_VFSUID; + return VFSUIDT_INIT(make_kuid(mnt_userns, uid)); +} +EXPORT_SYMBOL_GPL(make_vfsuid); + +/** + * make_vfsgid - map a filesystem kgid according to an idmapping + * @idmap: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * @kgid: kgid to be mapped + * + * Take a @kgid and remap it from @fs_userns into @idmap. Use this + * function when preparing a @kgid to be reported to userspace. + * + * If no_idmapping() determines that this is not an idmapped mount we can + * simply return @kgid unchanged. + * If initial_idmapping() tells us that the filesystem is not mounted with an + * idmapping we know the value of @kgid won't change when calling + * from_kgid() so we can simply retrieve the value via __kgid_val() + * directly. + * + * Return: @kgid mapped according to @idmap. + * If @kgid has no mapping in either @idmap or @fs_userns INVALID_GID is + * returned.
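To make the direction of these helpers concrete: make_vfsuid()/make_vfsgid() map filesystem ids outward when reporting to userspace, while from_vfsuid()/from_vfsgid() below map inward before anything is written to an inode. A hedged sketch of the reporting side, loosely modelled on what generic_fillattr() does with the i_uid_into_vfsuid()/i_gid_into_vfsgid() wrappers:

/* Sketch: surface on-disk ownership through the mount's idmapping. */
static void example_fill_owner(struct mnt_idmap *idmap,
			       struct inode *inode, struct kstat *stat)
{
	vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode);
	vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);

	stat->uid = vfsuid_into_kuid(vfsuid);
	stat->gid = vfsgid_into_kgid(vfsgid);
}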
+ */ +vfsgid_t make_vfsgid(struct mnt_idmap *idmap, + struct user_namespace *fs_userns, kgid_t kgid) +{ + gid_t gid; + struct user_namespace *mnt_userns = idmap->owner; + + if (no_idmapping(mnt_userns, fs_userns)) + return VFSGIDT_INIT(kgid); + if (initial_idmapping(fs_userns)) + gid = __kgid_val(kgid); + else + gid = from_kgid(fs_userns, kgid); + if (gid == (gid_t)-1) + return INVALID_VFSGID; + return VFSGIDT_INIT(make_kgid(mnt_userns, gid)); +} +EXPORT_SYMBOL_GPL(make_vfsgid); + +/** + * from_vfsuid - map a vfsuid into the filesystem idmapping + * @idmap: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * @vfsuid: vfsuid to be mapped + * + * Map @vfsuid into the filesystem idmapping. This function has to be used in + * order to e.g. write @vfsuid to inode->i_uid. + * + * Return: @vfsuid mapped into the filesystem idmapping + */ +kuid_t from_vfsuid(struct mnt_idmap *idmap, + struct user_namespace *fs_userns, vfsuid_t vfsuid) +{ + uid_t uid; + struct user_namespace *mnt_userns = idmap->owner; + + if (no_idmapping(mnt_userns, fs_userns)) + return AS_KUIDT(vfsuid); + uid = from_kuid(mnt_userns, AS_KUIDT(vfsuid)); + if (uid == (uid_t)-1) + return INVALID_UID; + if (initial_idmapping(fs_userns)) + return KUIDT_INIT(uid); + return make_kuid(fs_userns, uid); +} +EXPORT_SYMBOL_GPL(from_vfsuid); + +/** + * from_vfsgid - map a vfsgid into the filesystem idmapping + * @idmap: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * @vfsgid: vfsgid to be mapped + * + * Map @vfsgid into the filesystem idmapping. This function has to be used in + * order to e.g. write @vfsgid to inode->i_gid. + * + * Return: @vfsgid mapped into the filesystem idmapping + */ +kgid_t from_vfsgid(struct mnt_idmap *idmap, + struct user_namespace *fs_userns, vfsgid_t vfsgid) +{ + gid_t gid; + struct user_namespace *mnt_userns = idmap->owner; + + if (no_idmapping(mnt_userns, fs_userns)) + return AS_KGIDT(vfsgid); + gid = from_kgid(mnt_userns, AS_KGIDT(vfsgid)); + if (gid == (gid_t)-1) + return INVALID_GID; + if (initial_idmapping(fs_userns)) + return KGIDT_INIT(gid); + return make_kgid(fs_userns, gid); +} +EXPORT_SYMBOL_GPL(from_vfsgid); + +#ifdef CONFIG_MULTIUSER +/** + * vfsgid_in_group_p() - check whether a vfsgid matches the caller's groups + * @vfsgid: the mnt gid to match + * + * This function can be used to determine whether @vfsgid matches any of the + * caller's groups. + * + * Return: 1 if vfsgid matches caller's groups, 0 if not. + */ +int vfsgid_in_group_p(vfsgid_t vfsgid) +{ + return in_group_p(AS_KGIDT(vfsgid)); +} +#else +int vfsgid_in_group_p(vfsgid_t vfsgid) +{ + return 1; +} +#endif +EXPORT_SYMBOL_GPL(vfsgid_in_group_p); + +struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns) +{ + struct mnt_idmap *idmap; + + idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT); + if (!idmap) + return ERR_PTR(-ENOMEM); + + idmap->owner = get_user_ns(mnt_userns); + refcount_set(&idmap->count, 1); + return idmap; +} + +/** + * mnt_idmap_get - get a reference to an idmapping + * @idmap: the idmap to bump the reference on + * + * If @idmap is not the @nop_mnt_idmap bump the reference count. + * + * Return: @idmap with reference count bumped if @nop_mnt_idmap isn't passed.
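The ownership rule implied by mnt_idmap_get() and mnt_idmap_put() below: any structure that stores an idmap pointer holds a reference for its lifetime, and nop_mnt_idmap is a static singleton whose count is never touched. A hedged sketch (the holder struct is hypothetical):

struct example_holder {
	struct mnt_idmap *idmap;
};

static void example_holder_init(struct example_holder *h, struct vfsmount *mnt)
{
	h->idmap = mnt_idmap_get(mnt_idmap(mnt));	/* no-op for nop_mnt_idmap */
}

static void example_holder_release(struct example_holder *h)
{
	mnt_idmap_put(h->idmap);	/* frees the idmap on the last reference */
}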
+ */ +struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap) +{ + if (idmap != &nop_mnt_idmap) + refcount_inc(&idmap->count); + + return idmap; +} + +/** + * mnt_idmap_put - put a reference to an idmapping + * @idmap: the idmap to put the reference on + * + * If this is a non-initial idmapping, put the reference count when a mount is + * released and free it if we're the last user. + */ +void mnt_idmap_put(struct mnt_idmap *idmap) +{ + if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) { + put_user_ns(idmap->owner); + kfree(idmap); + } +} diff --git a/fs/mpage.c b/fs/mpage.c index 0f8ae954a579..22b9de5ddd68 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -198,7 +198,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) /* * Then do more get_blocks calls until we are done with this folio. */ - map_bh->b_page = &folio->page; + map_bh->b_folio = folio; while (page_block < blocks_per_page) { map_bh->b_state = 0; map_bh->b_size = 0; @@ -269,11 +269,6 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) alloc_new: if (args->bio == NULL) { - if (first_hole == blocks_per_page) { - if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9), - &folio->page)) - goto out; - } args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), opf, gfp); if (args->bio == NULL) @@ -445,15 +440,14 @@ void clean_page_buffers(struct page *page) clean_buffers(page, ~0U); } -static int __mpage_writepage(struct page *page, struct writeback_control *wbc, +static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { struct mpage_data *mpd = data; struct bio *bio = mpd->bio; - struct address_space *mapping = page->mapping; - struct inode *inode = page->mapping->host; + struct address_space *mapping = folio->mapping; + struct inode *inode = mapping->host; const unsigned blkbits = inode->i_blkbits; - unsigned long end_index; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; sector_t last_block; sector_t block_in_file; @@ -464,13 +458,13 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, int boundary = 0; sector_t boundary_block = 0; struct block_device *boundary_bdev = NULL; - int length; + size_t length; struct buffer_head map_bh; loff_t i_size = i_size_read(inode); int ret = 0; + struct buffer_head *head = folio_buffers(folio); - if (page_has_buffers(page)) { - struct buffer_head *head = page_buffers(page); + if (head) { struct buffer_head *bh = head; /* If they're all mapped and dirty, do it */ @@ -522,16 +516,24 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, /* * The page has no buffers: map it to disk */ - BUG_ON(!PageUptodate(page)); - block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits); + BUG_ON(!folio_test_uptodate(folio)); + block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits); + /* + * Whole page beyond EOF? Skip allocating blocks to avoid leaking + * space. 
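The guard that follows rounds i_size up to a whole number of blocks; a folio whose first block index is at or past that count lies entirely beyond EOF and gets no block allocation at all. A worked example, assuming 4096-byte folios and 512-byte blocks (blkbits = 9, so eight blocks per folio):

/*
 * i_size = 10000 bytes:
 *   blocks covering EOF = (10000 + 511) >> 9 = 20
 * folio index 2 -> block_in_file = 16  (16 < 20: written, tail zeroed)
 * folio index 3 -> block_in_file = 24  (24 >= 20: skipped entirely)
 */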
+ */ + if (block_in_file >= (i_size + (1 << blkbits) - 1) >> blkbits) + goto page_is_mapped; last_block = (i_size - 1) >> blkbits; - map_bh.b_page = page; + map_bh.b_folio = folio; for (page_block = 0; page_block < blocks_per_page; ) { map_bh.b_state = 0; map_bh.b_size = 1 << blkbits; if (mpd->get_block(inode, block_in_file, &map_bh, 1)) goto confused; + if (!buffer_mapped(&map_bh)) + goto confused; if (buffer_new(&map_bh)) clean_bdev_bh_alias(&map_bh); if (buffer_boundary(&map_bh)) { @@ -554,8 +556,11 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, first_unmapped = page_block; page_is_mapped: - end_index = i_size >> PAGE_SHIFT; - if (page->index >= end_index) { + /* Don't bother writing beyond EOF, truncate will discard the folio */ + if (folio_pos(folio) >= i_size) + goto confused; + length = folio_size(folio); + if (folio_pos(folio) + length > i_size) { /* * The page straddles i_size. It must be zeroed out on each * and every writepage invocation because it may be mmapped. @@ -564,11 +569,8 @@ page_is_mapped: * is zeroed when mapped, and writes to that region are not * written out to the file." */ - unsigned offset = i_size & (PAGE_SIZE - 1); - - if (page->index > end_index || !offset) - goto confused; - zero_user_segment(page, offset, PAGE_SIZE); + length = i_size - folio_pos(folio); + folio_zero_segment(folio, length, folio_size(folio)); } /* @@ -579,11 +581,6 @@ page_is_mapped: alloc_new: if (bio == NULL) { - if (first_unmapped == blocks_per_page) { - if (!bdev_write_page(bdev, blocks[0] << (blkbits - 9), - page, wbc)) - goto out; - } bio = bio_alloc(bdev, BIO_MAX_VECS, REQ_OP_WRITE | wbc_to_write_flags(wbc), GFP_NOFS); @@ -596,18 +593,18 @@ alloc_new: * the confused fail path above (OOM) will be very confused when * it finds all bh marked clean (i.e. it will not write anything) */ - wbc_account_cgroup_owner(wbc, page, PAGE_SIZE); + wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio)); length = first_unmapped << blkbits; - if (bio_add_page(bio, page, length, 0) < length) { + if (!bio_add_folio(bio, folio, length, 0)) { bio = mpage_bio_submit(bio); goto alloc_new; } - clean_buffers(page, first_unmapped); + clean_buffers(&folio->page, first_unmapped); - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - unlock_page(page); + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); + folio_unlock(folio); if (boundary || (first_unmapped != blocks_per_page)) { bio = mpage_bio_submit(bio); if (boundary_block) { @@ -626,7 +623,7 @@ confused: /* * The caller has a ref on the inode, so *mapping is stable */ - ret = block_write_full_page(page, mpd->get_block, wbc); + ret = block_write_full_page(&folio->page, mpd->get_block, wbc); mapping_set_error(mapping, ret); out: mpd->bio = bio; @@ -641,14 +638,6 @@ out: * * This is a library function, which implements the writepages() * address_space_operation. - * - * If a page is already under I/O, generic_writepages() skips it, even - * if it's dirty. This is desirable behaviour for memory-cleaning writeback, - * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() - * and msync() need to guarantee that all the data which was dirty at the time - * the call was made get new I/O started against them. If wbc->sync_mode is - * WB_SYNC_ALL then we were called for data integrity and we must wait for - * existing IO to complete. 
*/ int mpage_writepages(struct address_space *mapping, diff --git a/fs/namei.c b/fs/namei.c index 309ae6fc8c99..edfedfbccaef 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -20,6 +20,7 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/namei.h> #include <linux/pagemap.h> #include <linux/sched/mm.h> @@ -273,7 +274,7 @@ void putname(struct filename *name) /** * check_acl - perform ACL permission checking - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * @@ -281,13 +282,13 @@ void putname(struct filename *name) * retrieve POSIX acls it needs to know whether it is called from a blocking or * non-blocking context and thus cares about the MAY_NOT_BLOCK bit. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. */ -static int check_acl(struct user_namespace *mnt_userns, +static int check_acl(struct mnt_idmap *idmap, struct inode *inode, int mask) { #ifdef CONFIG_FS_POSIX_ACL @@ -300,14 +301,14 @@ static int check_acl(struct user_namespace *mnt_userns, /* no ->get_inode_acl() calls in RCU mode... */ if (is_uncached_acl(acl)) return -ECHILD; - return posix_acl_permission(mnt_userns, inode, acl, mask); + return posix_acl_permission(idmap, inode, acl, mask); } acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { - int error = posix_acl_permission(mnt_userns, inode, acl, mask); + int error = posix_acl_permission(idmap, inode, acl, mask); posix_acl_release(acl); return error; } @@ -318,7 +319,7 @@ static int check_acl(struct user_namespace *mnt_userns, /** * acl_permission_check - perform basic UNIX permission checking - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * @@ -326,20 +327,20 @@ static int check_acl(struct user_namespace *mnt_userns, * function may retrieve POSIX acls it needs to know whether it is called from a * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap.
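In practice callers rarely construct this argument by hand: code reached through a path walk takes the idmap straight from the vfsmount, and everything else passes the identity mapping. A minimal sketch of both cases (assuming a struct path *path and a bare inode are in scope):

	/* Through a mount: honour the mount's idmapping. */
	err = inode_permission(mnt_idmap(path->mnt), d_inode(path->dentry),
			       MAY_WRITE);

	/* No mount context: check against the raw inode. */
	err = inode_permission(&nop_mnt_idmap, inode, MAY_READ);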
*/ -static int acl_permission_check(struct user_namespace *mnt_userns, +static int acl_permission_check(struct mnt_idmap *idmap, struct inode *inode, int mask) { unsigned int mode = inode->i_mode; vfsuid_t vfsuid; /* Are we the owner? If so, ACL's don't matter */ - vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsuid = i_uid_into_vfsuid(idmap, inode); if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) { mask &= 7; mode >>= 6; @@ -348,7 +349,7 @@ static int acl_permission_check(struct user_namespace *mnt_userns, /* Do we have ACL's? */ if (IS_POSIXACL(inode) && (mode & S_IRWXG)) { - int error = check_acl(mnt_userns, inode, mask); + int error = check_acl(idmap, inode, mask); if (error != -EAGAIN) return error; } @@ -362,7 +363,7 @@ static int acl_permission_check(struct user_namespace *mnt_userns, * about? Need to check group ownership if so. */ if (mask & (mode ^ (mode >> 3))) { - vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); if (vfsgid_in_group_p(vfsgid)) mode >>= 3; } @@ -373,7 +374,7 @@ static int acl_permission_check(struct user_namespace *mnt_userns, /** * generic_permission - check for access rights on a Posix-like filesystem - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: inode to check access rights for * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, * %MAY_NOT_BLOCK ...) @@ -387,13 +388,13 @@ static int acl_permission_check(struct user_namespace *mnt_userns, * request cannot be satisfied (eg. requires blocking or too much complexity). * It would then be called again in ref-walk mode. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. */ -int generic_permission(struct user_namespace *mnt_userns, struct inode *inode, +int generic_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int ret; @@ -401,17 +402,17 @@ int generic_permission(struct user_namespace *mnt_userns, struct inode *inode, /* * Do the basic permission checks.
*/ - ret = acl_permission_check(mnt_userns, inode, mask); + ret = acl_permission_check(idmap, inode, mask); if (ret != -EACCES) return ret; if (S_ISDIR(inode->i_mode)) { /* DACs are overridable for directories */ if (!(mask & MAY_WRITE)) - if (capable_wrt_inode_uidgid(mnt_userns, inode, + if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_READ_SEARCH)) return 0; - if (capable_wrt_inode_uidgid(mnt_userns, inode, + if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_OVERRIDE)) return 0; return -EACCES; @@ -422,7 +423,7 @@ int generic_permission(struct user_namespace *mnt_userns, struct inode *inode, */ mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ) - if (capable_wrt_inode_uidgid(mnt_userns, inode, + if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_READ_SEARCH)) return 0; /* @@ -431,7 +432,7 @@ int generic_permission(struct user_namespace *mnt_userns, struct inode *inode, * at least one exec bit set. */ if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) - if (capable_wrt_inode_uidgid(mnt_userns, inode, + if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_OVERRIDE)) return 0; @@ -441,7 +442,7 @@ EXPORT_SYMBOL(generic_permission); /** * do_inode_permission - UNIX permission checking - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * @@ -450,19 +451,19 @@ EXPORT_SYMBOL(generic_permission); * flag in inode->i_opflags, that says "this has not special * permission function, use the fast case". */ -static inline int do_inode_permission(struct user_namespace *mnt_userns, +static inline int do_inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) { if (likely(inode->i_op->permission)) - return inode->i_op->permission(mnt_userns, inode, mask); + return inode->i_op->permission(idmap, inode, mask); /* This gets set once for the inode lifetime */ spin_lock(&inode->i_lock); inode->i_opflags |= IOP_FASTPERM; spin_unlock(&inode->i_lock); } - return generic_permission(mnt_userns, inode, mask); + return generic_permission(idmap, inode, mask); } /** @@ -487,7 +488,7 @@ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) /** * inode_permission - Check for access rights to a given inode - * @mnt_userns: User namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: Inode to check permission on * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * @@ -497,7 +498,7 @@ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) * * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. */ -int inode_permission(struct user_namespace *mnt_userns, +int inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int retval; @@ -518,11 +519,11 @@ int inode_permission(struct user_namespace *mnt_userns, * written back improperly if their true value is unknown * to the vfs. 
*/ - if (HAS_UNMAPPED_ID(mnt_userns, inode)) + if (HAS_UNMAPPED_ID(idmap, inode)) return -EACCES; } - retval = do_inode_permission(mnt_userns, inode, mask); + retval = do_inode_permission(idmap, inode, mask); if (retval) return retval; @@ -1094,14 +1095,14 @@ fs_initcall(init_fs_namei_sysctls); */ static inline int may_follow_link(struct nameidata *nd, const struct inode *inode) { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; vfsuid_t vfsuid; if (!sysctl_protected_symlinks) return 0; - mnt_userns = mnt_user_ns(nd->path.mnt); - vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + idmap = mnt_idmap(nd->path.mnt); + vfsuid = i_uid_into_vfsuid(idmap, inode); /* Allowed if owner and follower match. */ if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return 0; @@ -1124,7 +1125,7 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod /** * safe_hardlink_source - Check for safe hardlink conditions - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: the source inode to hardlink from * * Return false if at least one of the following conditions: @@ -1135,7 +1136,7 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod * * Otherwise returns true. */ -static bool safe_hardlink_source(struct user_namespace *mnt_userns, +static bool safe_hardlink_source(struct mnt_idmap *idmap, struct inode *inode) { umode_t mode = inode->i_mode; @@ -1153,7 +1154,7 @@ static bool safe_hardlink_source(struct user_namespace *mnt_userns, return false; /* Hardlinking to unreadable or unwritable sources is dangerous. */ - if (inode_permission(mnt_userns, inode, MAY_READ | MAY_WRITE)) + if (inode_permission(idmap, inode, MAY_READ | MAY_WRITE)) return false; return true; @@ -1161,8 +1162,8 @@ static bool safe_hardlink_source(struct user_namespace *mnt_userns, /** * may_linkat - Check permissions for creating a hardlink - * @mnt_userns: user namespace of the mount the inode was found from - * @link: the source to hardlink from + * @idmap: idmap of the mount the inode was found from + * @link: the source to hardlink from * * Block hardlink when all of: * - sysctl_protected_hardlinks enabled @@ -1170,21 +1171,21 @@ static bool safe_hardlink_source(struct user_namespace *mnt_userns, * - hardlink source is unsafe (see safe_hardlink_source() above) * - not CAP_FOWNER in a namespace with the inode owner uid mapped * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. * * Returns 0 if successful, -ve on error. */ -int may_linkat(struct user_namespace *mnt_userns, const struct path *link) +int may_linkat(struct mnt_idmap *idmap, const struct path *link) { struct inode *inode = link->dentry->d_inode; /* Inode writeback is not safe when the uid or gid are invalid. 
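The -EOVERFLOW guard that follows is worth unpacking: i_uid_into_vfsuid() is essentially shorthand for make_vfsuid(idmap, i_user_ns(inode), inode->i_uid), so an invalid result means the inode's owner has no representation under this mount's idmapping and writing the inode back could corrupt ownership. Sketched in isolation:

	vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode);

	if (!vfsuid_valid(vfsuid))	/* owner unmapped in this idmapping */
		return -EOVERFLOW;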
*/ - if (!vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)) || - !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode))) + if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || + !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; if (!sysctl_protected_hardlinks) @@ -1193,8 +1194,8 @@ int may_linkat(struct user_namespace *mnt_userns, const struct path *link) /* Source inode owner (or CAP_FOWNER) can hardlink all they like, * otherwise, it must be a safe source. */ - if (safe_hardlink_source(mnt_userns, inode) || - inode_owner_or_capable(mnt_userns, inode)) + if (safe_hardlink_source(idmap, inode) || + inode_owner_or_capable(idmap, inode)) return 0; audit_log_path_denied(AUDIT_ANOM_LINK, "linkat"); @@ -1205,7 +1206,7 @@ int may_linkat(struct user_namespace *mnt_userns, const struct path *link) * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory * should be allowed, or not, on files that already * exist. - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @nd: nameidata pathwalk data * @inode: the inode of the file to open * @@ -1220,15 +1221,15 @@ int may_linkat(struct user_namespace *mnt_userns, const struct path *link) * the directory doesn't have to be world writable: being group writable will * be enough. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. * * Returns 0 if the open is allowed, -ve on error. */ -static int may_create_in_sticky(struct user_namespace *mnt_userns, +static int may_create_in_sticky(struct mnt_idmap *idmap, struct nameidata *nd, struct inode *const inode) { umode_t dir_mode = nd->dir_mode; @@ -1237,8 +1238,8 @@ static int may_create_in_sticky(struct user_namespace *mnt_userns, if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) || (!sysctl_protected_regular && S_ISREG(inode->i_mode)) || likely(!(dir_mode & S_ISVTX)) || - vfsuid_eq(i_uid_into_vfsuid(mnt_userns, inode), dir_vfsuid) || - vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), current_fsuid())) + vfsuid_eq(i_uid_into_vfsuid(idmap, inode), dir_vfsuid) || + vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) return 0; if (likely(dir_mode & 0002) || @@ -1458,11 +1459,11 @@ EXPORT_SYMBOL(follow_down_one); * point, the filesystem owning that dentry may be queried as to whether the * caller is permitted to proceed or not. 
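The signature change just below threads a flags word through to traverse_mounts(). Passing 0 preserves the old behaviour of crossing only what is already mounted; passing LOOKUP_AUTOMOUNT should additionally trigger automount points on the way down (hedged: inferred from the flag's meaning elsewhere in namei, this hunk does not show a caller):

	/* Old behaviour: step down through existing mounts only. */
	err = follow_down(&path, 0);

	/* Also trigger automounts while descending. */
	err = follow_down(&path, LOOKUP_AUTOMOUNT);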
*/ -int follow_down(struct path *path) +int follow_down(struct path *path, unsigned int flags) { struct vfsmount *mnt = path->mnt; bool jumped; - int ret = traverse_mounts(path, &jumped, NULL, 0); + int ret = traverse_mounts(path, &jumped, NULL, flags); if (path->mnt != mnt) mntput(mnt); @@ -1704,15 +1705,15 @@ static struct dentry *lookup_slow(const struct qstr *name, return res; } -static inline int may_lookup(struct user_namespace *mnt_userns, +static inline int may_lookup(struct mnt_idmap *idmap, struct nameidata *nd) { if (nd->flags & LOOKUP_RCU) { - int err = inode_permission(mnt_userns, nd->inode, MAY_EXEC|MAY_NOT_BLOCK); + int err = inode_permission(idmap, nd->inode, MAY_EXEC|MAY_NOT_BLOCK); if (err != -ECHILD || !try_to_unlazy(nd)) return err; } - return inode_permission(mnt_userns, nd->inode, MAY_EXEC); + return inode_permission(idmap, nd->inode, MAY_EXEC); } static int reserve_stack(struct nameidata *nd, struct path *link) @@ -2253,13 +2254,13 @@ static int link_path_walk(const char *name, struct nameidata *nd) /* At this point we know we have a real path component. */ for(;;) { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; const char *link; u64 hash_len; int type; - mnt_userns = mnt_user_ns(nd->path.mnt); - err = may_lookup(mnt_userns, nd); + idmap = mnt_idmap(nd->path.mnt); + err = may_lookup(idmap, nd); if (err) return err; @@ -2307,7 +2308,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) OK: /* pathname or trailing symlink, done */ if (!depth) { - nd->dir_vfsuid = i_uid_into_vfsuid(mnt_userns, nd->inode); + nd->dir_vfsuid = i_uid_into_vfsuid(idmap, nd->inode); nd->dir_mode = nd->inode->i_mode; nd->flags &= ~LOOKUP_PARENT; return 0; @@ -2622,7 +2623,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, } EXPORT_SYMBOL(vfs_path_lookup); -static int lookup_one_common(struct user_namespace *mnt_userns, +static int lookup_one_common(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len, struct qstr *this) { @@ -2652,7 +2653,7 @@ static int lookup_one_common(struct user_namespace *mnt_userns, return err; } - return inode_permission(mnt_userns, base->d_inode, MAY_EXEC); + return inode_permission(idmap, base->d_inode, MAY_EXEC); } /** @@ -2676,7 +2677,7 @@ struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len WARN_ON_ONCE(!inode_is_locked(base->d_inode)); - err = lookup_one_common(&init_user_ns, name, base, len, &this); + err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this); if (err) return ERR_PTR(err); @@ -2703,7 +2704,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) WARN_ON_ONCE(!inode_is_locked(base->d_inode)); - err = lookup_one_common(&init_user_ns, name, base, len, &this); + err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this); if (err) return ERR_PTR(err); @@ -2714,7 +2715,7 @@ EXPORT_SYMBOL(lookup_one_len); /** * lookup_one - filesystem helper to lookup single pathname component - * @mnt_userns: user namespace of the mount the lookup is performed from + * @idmap: idmap of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to @@ -2724,7 +2725,7 @@ EXPORT_SYMBOL(lookup_one_len); * * The caller must hold base->i_mutex. 
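may_lookup() above is the template for permission checks taken during RCU pathwalk: probe with MAY_NOT_BLOCK first, and drop to ref-walk with try_to_unlazy() only when the probe cannot decide without sleeping. A hedged sketch of the same two-phase shape for a hypothetical MAY_WRITE check (my_check_write is illustrative only):

	static int my_check_write(struct mnt_idmap *idmap, struct nameidata *nd)
	{
		if (nd->flags & LOOKUP_RCU) {
			/* Non-blocking probe; -ECHILD means "can't decide
			 * without sleeping".
			 */
			int err = inode_permission(idmap, nd->inode,
						   MAY_WRITE | MAY_NOT_BLOCK);
			if (err != -ECHILD || !try_to_unlazy(nd))
				return err;
		}
		/* Ref-walk mode: the check may now block. */
		return inode_permission(idmap, nd->inode, MAY_WRITE);
	}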
*/ -struct dentry *lookup_one(struct user_namespace *mnt_userns, const char *name, +struct dentry *lookup_one(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len) { struct dentry *dentry; @@ -2733,7 +2734,7 @@ struct dentry *lookup_one(struct user_namespace *mnt_userns, const char *name, WARN_ON_ONCE(!inode_is_locked(base->d_inode)); - err = lookup_one_common(mnt_userns, name, base, len, &this); + err = lookup_one_common(idmap, name, base, len, &this); if (err) return ERR_PTR(err); @@ -2744,7 +2745,7 @@ EXPORT_SYMBOL(lookup_one); /** * lookup_one_unlocked - filesystem helper to lookup single pathname component - * @mnt_userns: idmapping of the mount the lookup is performed from + * @idmap: idmap of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to @@ -2755,7 +2756,7 @@ EXPORT_SYMBOL(lookup_one); * Unlike lookup_one_len, it should be called without the parent * i_mutex held, and will take the i_mutex itself if necessary. */ -struct dentry *lookup_one_unlocked(struct user_namespace *mnt_userns, +struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len) { @@ -2763,7 +2764,7 @@ struct dentry *lookup_one_unlocked(struct user_namespace *mnt_userns, int err; struct dentry *ret; - err = lookup_one_common(mnt_userns, name, base, len, &this); + err = lookup_one_common(idmap, name, base, len, &this); if (err) return ERR_PTR(err); @@ -2777,7 +2778,7 @@ EXPORT_SYMBOL(lookup_one_unlocked); /** * lookup_one_positive_unlocked - filesystem helper to lookup single * pathname component - * @mnt_userns: idmapping of the mount the lookup is performed from + * @idmap: idmap of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to @@ -2794,11 +2795,11 @@ EXPORT_SYMBOL(lookup_one_unlocked); * * The helper should be called without i_mutex held. 
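The lookup_one*() conversions above keep the old lookup_one_len() entry points as thin wrappers that pass &nop_mnt_idmap. A hedged sketch of an idmap-aware caller, with the name "state" and the helper purely illustrative:

	static struct dentry *my_lookup_child(struct path *parent)
	{
		/* Pass the idmap of the mount @parent was found on; a
		 * filesystem doing a purely internal lookup would pass
		 * &nop_mnt_idmap instead, which is exactly what the
		 * lookup_one_len_unlocked() wrapper now does.
		 */
		return lookup_one_unlocked(mnt_idmap(parent->mnt), "state",
					   parent->dentry, strlen("state"));
	}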
*/ -struct dentry *lookup_one_positive_unlocked(struct user_namespace *mnt_userns, +struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len) { - struct dentry *ret = lookup_one_unlocked(mnt_userns, name, base, len); + struct dentry *ret = lookup_one_unlocked(idmap, name, base, len); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { dput(ret); @@ -2823,7 +2824,7 @@ EXPORT_SYMBOL(lookup_one_positive_unlocked); struct dentry *lookup_one_len_unlocked(const char *name, struct dentry *base, int len) { - return lookup_one_unlocked(&init_user_ns, name, base, len); + return lookup_one_unlocked(&nop_mnt_idmap, name, base, len); } EXPORT_SYMBOL(lookup_one_len_unlocked); @@ -2838,7 +2839,7 @@ EXPORT_SYMBOL(lookup_one_len_unlocked); struct dentry *lookup_positive_unlocked(const char *name, struct dentry *base, int len) { - return lookup_one_positive_unlocked(&init_user_ns, name, base, len); + return lookup_one_positive_unlocked(&nop_mnt_idmap, name, base, len); } EXPORT_SYMBOL(lookup_positive_unlocked); @@ -2864,7 +2865,7 @@ int path_pts(struct path *path) path->dentry = child; dput(parent); - follow_down(path); + follow_down(path, 0); return 0; } #endif @@ -2880,16 +2881,16 @@ int user_path_at_empty(int dfd, const char __user *name, unsigned flags, } EXPORT_SYMBOL(user_path_at_empty); -int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir, +int __check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode) { kuid_t fsuid = current_fsuid(); - if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), fsuid)) + if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), fsuid)) return 0; - if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, dir), fsuid)) + if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, dir), fsuid)) return 0; - return !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FOWNER); + return !capable_wrt_inode_uidgid(idmap, inode, CAP_FOWNER); } EXPORT_SYMBOL(__check_sticky); @@ -2913,7 +2914,7 @@ EXPORT_SYMBOL(__check_sticky); * 11. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). */ -static int may_delete(struct user_namespace *mnt_userns, struct inode *dir, +static int may_delete(struct mnt_idmap *idmap, struct inode *dir, struct dentry *victim, bool isdir) { struct inode *inode = d_backing_inode(victim); @@ -2926,21 +2927,21 @@ static int may_delete(struct user_namespace *mnt_userns, struct inode *dir, BUG_ON(victim->d_parent->d_inode != dir); /* Inode writeback is not safe when the uid or gid are invalid. */ - if (!vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)) || - !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode))) + if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || + !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); - error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); + error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) return -EPERM; - if (check_sticky(mnt_userns, dir, inode) || IS_APPEND(inode) || + if (check_sticky(idmap, dir, inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || - HAS_UNMAPPED_ID(mnt_userns, inode)) + HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; if (isdir) { if (!d_is_dir(victim)) @@ -2965,7 +2966,7 @@ static int may_delete(struct user_namespace *mnt_userns, struct inode *dir, * 4. We should have write and exec permissions on dir * 5. 
We can't do it if dir is immutable (done in permission()) */ -static inline int may_create(struct user_namespace *mnt_userns, +static inline int may_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *child) { audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); @@ -2973,10 +2974,10 @@ static inline int may_create(struct user_namespace *mnt_userns, return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; - if (!fsuidgid_has_mapping(dir->i_sb, mnt_userns)) + if (!fsuidgid_has_mapping(dir->i_sb, idmap)) return -EOVERFLOW; - return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); + return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); } /* @@ -3044,7 +3045,7 @@ static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode) /** * vfs_prepare_mode - prepare the mode to be used for a new inode - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: parent directory of the new inode * @mode: mode of the new inode * @mask_perms: allowed permission by the vfs @@ -3065,11 +3066,11 @@ static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode) * * Returns: mode to be passed to the filesystem */ -static inline umode_t vfs_prepare_mode(struct user_namespace *mnt_userns, +static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode, umode_t mask_perms, umode_t type) { - mode = mode_strip_sgid(mnt_userns, dir, mode); + mode = mode_strip_sgid(idmap, dir, mode); mode = mode_strip_umask(dir, mode); /* @@ -3084,7 +3085,7 @@ static inline umode_t vfs_prepare_mode(struct user_namespace *mnt_userns, /** * vfs_create - create new file - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: inode of @dentry * @dentry: pointer to dentry of the base directory * @mode: mode of the new file * @want_excl: whether the file must not yet exist * * Create a new file. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. */ -int vfs_create(struct user_namespace *mnt_userns, struct inode *dir, +int vfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool want_excl) { - int error = may_create(mnt_userns, dir, dentry); + int error; + + error = may_create(idmap, dir, dentry); if (error) return error; if (!dir->i_op->create) return -EACCES; /* shouldn't it be ENOSYS?
*/ - mode = vfs_prepare_mode(mnt_userns, dir, mode, S_IALLUGO, S_IFREG); + mode = vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG); error = security_inode_create(dir, dentry, mode); if (error) return error; - error = dir->i_op->create(mnt_userns, dir, dentry, mode, want_excl); + error = dir->i_op->create(idmap, dir, dentry, mode, want_excl); if (!error) fsnotify_create(dir, dentry); return error; @@ -3124,7 +3127,7 @@ int vfs_mkobj(struct dentry *dentry, umode_t mode, void *arg) { struct inode *dir = dentry->d_parent->d_inode; - int error = may_create(&init_user_ns, dir, dentry); + int error = may_create(&nop_mnt_idmap, dir, dentry); if (error) return error; @@ -3146,7 +3149,7 @@ bool may_open_dev(const struct path *path) !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV); } -static int may_open(struct user_namespace *mnt_userns, const struct path *path, +static int may_open(struct mnt_idmap *idmap, const struct path *path, int acc_mode, int flag) { struct dentry *dentry = path->dentry; @@ -3182,7 +3185,7 @@ static int may_open(struct user_namespace *mnt_userns, const struct path *path, break; } - error = inode_permission(mnt_userns, inode, MAY_OPEN | acc_mode); + error = inode_permission(idmap, inode, MAY_OPEN | acc_mode); if (error) return error; @@ -3197,13 +3200,13 @@ static int may_open(struct user_namespace *mnt_userns, const struct path *path, } /* O_NOATIME can only be set by the owner or superuser */ - if (flag & O_NOATIME && !inode_owner_or_capable(mnt_userns, inode)) + if (flag & O_NOATIME && !inode_owner_or_capable(idmap, inode)) return -EPERM; return 0; } -static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp) +static int handle_truncate(struct mnt_idmap *idmap, struct file *filp) { const struct path *path = &filp->f_path; struct inode *inode = path->dentry->d_inode; @@ -3213,7 +3216,7 @@ static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp) error = security_file_truncate(filp); if (!error) { - error = do_truncate(mnt_userns, path->dentry, 0, + error = do_truncate(idmap, path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, filp); } @@ -3228,7 +3231,7 @@ static inline int open_to_namei_flags(int flag) return flag; } -static int may_o_create(struct user_namespace *mnt_userns, +static int may_o_create(struct mnt_idmap *idmap, const struct path *dir, struct dentry *dentry, umode_t mode) { @@ -3236,10 +3239,10 @@ static int may_o_create(struct user_namespace *mnt_userns, if (error) return error; - if (!fsuidgid_has_mapping(dir->dentry->d_sb, mnt_userns)) + if (!fsuidgid_has_mapping(dir->dentry->d_sb, idmap)) return -EOVERFLOW; - error = inode_permission(mnt_userns, dir->dentry->d_inode, + error = inode_permission(idmap, dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); if (error) return error; @@ -3319,7 +3322,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, const struct open_flags *op, bool got_write) { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; struct dentry *dir = nd->path.dentry; struct inode *dir_inode = dir->d_inode; int open_flag = op->open_flag; @@ -3367,13 +3370,13 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, */ if (unlikely(!got_write)) open_flag &= ~O_TRUNC; - mnt_userns = mnt_user_ns(nd->path.mnt); + idmap = mnt_idmap(nd->path.mnt); if (open_flag & O_CREAT) { if (open_flag & O_EXCL) open_flag &= ~O_TRUNC; - mode = vfs_prepare_mode(mnt_userns, dir->d_inode, mode, mode, mode); + mode = vfs_prepare_mode(idmap, dir->d_inode, mode, mode, mode); if 
(likely(got_write)) - create_error = may_o_create(mnt_userns, &nd->path, + create_error = may_o_create(idmap, &nd->path, dentry, mode); else create_error = -EROFS; @@ -3410,7 +3413,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, goto out_dput; } - error = dir_inode->i_op->create(mnt_userns, dir_inode, dentry, + error = dir_inode->i_op->create(idmap, dir_inode, dentry, mode, open_flag & O_EXCL); if (error) goto out_dput; @@ -3513,7 +3516,7 @@ finish_lookup: static int do_open(struct nameidata *nd, struct file *file, const struct open_flags *op) { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; int open_flag = op->open_flag; bool do_truncate; int acc_mode; @@ -3526,13 +3529,13 @@ static int do_open(struct nameidata *nd, } if (!(file->f_mode & FMODE_CREATED)) audit_inode(nd->name, nd->path.dentry, 0); - mnt_userns = mnt_user_ns(nd->path.mnt); + idmap = mnt_idmap(nd->path.mnt); if (open_flag & O_CREAT) { if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED)) return -EEXIST; if (d_is_dir(nd->path.dentry)) return -EISDIR; - error = may_create_in_sticky(mnt_userns, nd, + error = may_create_in_sticky(idmap, nd, d_backing_inode(nd->path.dentry)); if (unlikely(error)) return error; @@ -3552,13 +3555,13 @@ static int do_open(struct nameidata *nd, return error; do_truncate = true; } - error = may_open(mnt_userns, &nd->path, acc_mode, open_flag); + error = may_open(idmap, &nd->path, acc_mode, open_flag); if (!error && !(file->f_mode & FMODE_OPENED)) error = vfs_open(&nd->path, file); if (!error) error = ima_file_check(file, op->acc_mode); if (!error && do_truncate) - error = handle_truncate(mnt_userns, file); + error = handle_truncate(idmap, file); if (unlikely(error > 0)) { WARN_ON(1); error = -EINVAL; @@ -3570,20 +3573,20 @@ static int do_open(struct nameidata *nd, /** * vfs_tmpfile - create tmpfile - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: pointer to dentry of the base directory * @mode: mode of the new tmpfile * @open_flag: flags * * Create a temporary file. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap.
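For kernel-internal temporary files the patch exposes vfs_tmpfile_open(), defined just below. A hedged sketch of a caller, with error handling abridged; the helper name and the 0600 mode are illustrative, and the caller is assumed to already hold write access to the mount (mnt_want_write()), as do_tmpfile() arranges:

	static struct file *my_open_tmpfile(const struct path *parentpath)
	{
		/* The idmap is always derived from the vfsmount now. */
		return vfs_tmpfile_open(mnt_idmap(parentpath->mnt), parentpath,
					S_IFREG | 0600, O_RDWR,
					current_cred());
	}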
*/ -static int vfs_tmpfile(struct user_namespace *mnt_userns, +static int vfs_tmpfile(struct mnt_idmap *idmap, const struct path *parentpath, struct file *file, umode_t mode) { @@ -3594,7 +3597,7 @@ static int vfs_tmpfile(struct user_namespace *mnt_userns, int open_flag = file->f_flags; /* we want directory to be writable */ - error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); + error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (!dir->i_op->tmpfile) @@ -3604,13 +3607,13 @@ static int vfs_tmpfile(struct user_namespace *mnt_userns, return -ENOMEM; file->f_path.mnt = parentpath->mnt; file->f_path.dentry = child; - mode = vfs_prepare_mode(mnt_userns, dir, mode, mode, mode); - error = dir->i_op->tmpfile(mnt_userns, dir, file, mode); + mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); + error = dir->i_op->tmpfile(idmap, dir, file, mode); dput(child); if (error) return error; /* Don't check for other permissions, the inode was just created */ - error = may_open(mnt_userns, &file->f_path, 0, file->f_flags); + error = may_open(idmap, &file->f_path, 0, file->f_flags); if (error) return error; inode = file_inode(file); @@ -3619,13 +3622,13 @@ static int vfs_tmpfile(struct user_namespace *mnt_userns, inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); } - ima_post_create_tmpfile(mnt_userns, inode); + ima_post_create_tmpfile(idmap, inode); return 0; } /** * vfs_tmpfile_open - open a tmpfile for kernel internal use - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @parentpath: path of the base directory * @mode: mode of the new tmpfile * @open_flag: flags @@ -3635,7 +3638,7 @@ static int vfs_tmpfile(struct user_namespace *mnt_userns, * hence this is only for kernel internal use, and must not be installed into * file tables or such. */ -struct file *vfs_tmpfile_open(struct user_namespace *mnt_userns, +struct file *vfs_tmpfile_open(struct mnt_idmap *idmap, const struct path *parentpath, umode_t mode, int open_flag, const struct cred *cred) { @@ -3644,7 +3647,7 @@ struct file *vfs_tmpfile_open(struct user_namespace *mnt_userns, file = alloc_empty_file_noaccount(open_flag, cred); if (!IS_ERR(file)) { - error = vfs_tmpfile(mnt_userns, parentpath, file, mode); + error = vfs_tmpfile(idmap, parentpath, file, mode); if (error) { fput(file); file = ERR_PTR(error); @@ -3658,7 +3661,6 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, const struct open_flags *op, struct file *file) { - struct user_namespace *mnt_userns; struct path path; int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path); @@ -3667,8 +3669,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, error = mnt_want_write(path.mnt); if (unlikely(error)) goto out; - mnt_userns = mnt_user_ns(path.mnt); - error = vfs_tmpfile(mnt_userns, &path, file, op->mode); + error = vfs_tmpfile(mnt_idmap(path.mnt), &path, file, op->mode); if (error) goto out2; audit_inode(nd->name, file->f_path.dentry, 0); @@ -3873,7 +3874,7 @@ EXPORT_SYMBOL(user_path_create); /** * vfs_mknod - create device node or file - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: inode of @dentry * @dentry: pointer to dentry of the base directory * @mode: mode of the new device node or file @@ -3881,17 +3882,17 @@ EXPORT_SYMBOL(user_path_create); * * Create a device node or file. 
* - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. */ -int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV; - int error = may_create(mnt_userns, dir, dentry); + int error = may_create(idmap, dir, dentry); if (error) return error; @@ -3903,7 +3904,7 @@ int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, if (!dir->i_op->mknod) return -EPERM; - mode = vfs_prepare_mode(mnt_userns, dir, mode, mode, mode); + mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); error = devcgroup_inode_mknod(mode, dev); if (error) return error; @@ -3912,7 +3913,7 @@ int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, if (error) return error; - error = dir->i_op->mknod(mnt_userns, dir, dentry, mode, dev); + error = dir->i_op->mknod(idmap, dir, dentry, mode, dev); if (!error) fsnotify_create(dir, dentry); return error; @@ -3939,7 +3940,7 @@ static int may_mknod(umode_t mode) static int do_mknodat(int dfd, struct filename *name, umode_t mode, unsigned int dev) { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; struct dentry *dentry; struct path path; int error; @@ -3959,20 +3960,20 @@ retry: if (error) goto out2; - mnt_userns = mnt_user_ns(path.mnt); + idmap = mnt_idmap(path.mnt); switch (mode & S_IFMT) { case 0: case S_IFREG: - error = vfs_create(mnt_userns, path.dentry->d_inode, + error = vfs_create(idmap, path.dentry->d_inode, dentry, mode, true); if (!error) - ima_post_path_mknod(mnt_userns, dentry); + ima_post_path_mknod(idmap, dentry); break; case S_IFCHR: case S_IFBLK: - error = vfs_mknod(mnt_userns, path.dentry->d_inode, + error = vfs_mknod(idmap, path.dentry->d_inode, dentry, mode, new_decode_dev(dev)); break; case S_IFIFO: case S_IFSOCK: - error = vfs_mknod(mnt_userns, path.dentry->d_inode, + error = vfs_mknod(idmap, path.dentry->d_inode, dentry, mode, 0); break; } @@ -4000,32 +4001,33 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d /** * vfs_mkdir - create directory - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: inode of @dentry * @dentry: pointer to dentry of the base directory * @mode: mode of the new directory * * Create a directory. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. */ -int vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { - int error = may_create(mnt_userns, dir, dentry); + int error; unsigned max_links = dir->i_sb->s_max_links; + error = may_create(idmap, dir, dentry); if (error) return error; if (!dir->i_op->mkdir) return -EPERM; - mode = vfs_prepare_mode(mnt_userns, dir, mode, S_IRWXUGO | S_ISVTX, 0); + mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0); error = security_inode_mkdir(dir, dentry, mode); if (error) return error; @@ -4033,7 +4035,7 @@ int vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, if (max_links && dir->i_nlink >= max_links) return -EMLINK; - error = dir->i_op->mkdir(mnt_userns, dir, dentry, mode); + error = dir->i_op->mkdir(idmap, dir, dentry, mode); if (!error) fsnotify_mkdir(dir, dentry); return error; @@ -4056,10 +4058,8 @@ retry: error = security_path_mkdir(&path, dentry, mode_strip_umask(path.dentry->d_inode, mode)); if (!error) { - struct user_namespace *mnt_userns; - mnt_userns = mnt_user_ns(path.mnt); - error = vfs_mkdir(mnt_userns, path.dentry->d_inode, dentry, - mode); + error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, + dentry, mode); } done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { @@ -4083,22 +4083,22 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) /** * vfs_rmdir - remove directory - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: inode of @dentry * @dentry: pointer to dentry of the base directory * * Remove a directory. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap.
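As the do_mkdirat() hunk above shows, the syscall paths no longer keep a local user namespace variable; the idmap is derived at the call site. A hedged sketch of a kernel caller creating and then removing a directory; the helper name and 0755 mode are illustrative, and the usual locking (parent i_rwsem held, as filename_create() arranges) is assumed:

	static int my_mkdir_rmdir(const struct path *parent,
				  struct dentry *dentry)
	{
		struct mnt_idmap *idmap = mnt_idmap(parent->mnt);
		struct inode *dir = d_inode(parent->dentry);
		int err;

		err = vfs_mkdir(idmap, dir, dentry, 0755);
		if (err)
			return err;
		/* vfs_rmdir() re-runs may_delete() against the same idmap. */
		return vfs_rmdir(idmap, dir, dentry);
	}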
*/ -int vfs_rmdir(struct user_namespace *mnt_userns, struct inode *dir, +int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry) { - int error = may_delete(mnt_userns, dir, dentry, 1); + int error = may_delete(idmap, dir, dentry, 1); if (error) return error; @@ -4138,7 +4138,6 @@ EXPORT_SYMBOL(vfs_rmdir); int do_rmdir(int dfd, struct filename *name) { - struct user_namespace *mnt_userns; int error; struct dentry *dentry; struct path path; @@ -4178,8 +4177,7 @@ retry: error = security_path_rmdir(&path, dentry); if (error) goto exit4; - mnt_userns = mnt_user_ns(path.mnt); - error = vfs_rmdir(mnt_userns, path.dentry->d_inode, dentry); + error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry); exit4: dput(dentry); exit3: @@ -4203,7 +4201,7 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) /** * vfs_unlink - unlink a filesystem object - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: parent directory * @dentry: victim * @delegated_inode: returns victim inode, if the inode is delegated. @@ -4220,17 +4218,17 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. */ -int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir, +int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, struct inode **delegated_inode) { struct inode *target = dentry->d_inode; - int error = may_delete(mnt_userns, dir, dentry, 0); + int error = may_delete(idmap, dir, dentry, 0); if (error) return error; @@ -4304,7 +4302,6 @@ retry_deleg: dentry = __lookup_hash(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { - struct user_namespace *mnt_userns; /* Why not before? Because we want correct error value */ if (last.name[last.len]) @@ -4316,9 +4313,8 @@ retry_deleg: error = security_path_unlink(&path, dentry); if (error) goto exit3; - mnt_userns = mnt_user_ns(path.mnt); - error = vfs_unlink(mnt_userns, path.dentry->d_inode, dentry, - &delegated_inode); + error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode, + dentry, &delegated_inode); exit3: dput(dentry); } @@ -4370,24 +4366,25 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname) /** * vfs_symlink - create symlink - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dir: inode of @dentry * @dentry: pointer to dentry of the base directory * @oldname: name of the file to link to * * Create a symlink. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions.
+ * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. */ -int vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *oldname) { - int error = may_create(mnt_userns, dir, dentry); + int error; + error = may_create(idmap, dir, dentry); if (error) return error; @@ -4398,7 +4395,7 @@ int vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (error) return error; - error = dir->i_op->symlink(mnt_userns, dir, dentry, oldname); + error = dir->i_op->symlink(idmap, dir, dentry, oldname); if (!error) fsnotify_create(dir, dentry); return error; @@ -4423,13 +4420,9 @@ retry: goto out_putnames; error = security_path_symlink(&path, dentry, from->name); - if (!error) { - struct user_namespace *mnt_userns; - - mnt_userns = mnt_user_ns(path.mnt); - error = vfs_symlink(mnt_userns, path.dentry->d_inode, dentry, - from->name); - } + if (!error) + error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, + dentry, from->name); done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -4455,7 +4448,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn /** * vfs_link - create a new link * @old_dentry: object to be linked - * @mnt_userns: the user namespace of the mount + * @idmap: idmap of the mount * @dir: new parent * @new_dentry: where to create the new link * @delegated_inode: returns inode needing a delegation break @@ -4472,13 +4465,13 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then take - * care to map the inode according to @mnt_userns before checking permissions. + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then take + * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs init_user_ns. + * raw inode simply pass @nop_mnt_idmap. */ -int vfs_link(struct dentry *old_dentry, struct user_namespace *mnt_userns, +int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode) { @@ -4489,7 +4482,7 @@ int vfs_link(struct dentry *old_dentry, struct user_namespace *mnt_userns, if (!inode) return -ENOENT; - error = may_create(mnt_userns, dir, new_dentry); + error = may_create(idmap, dir, new_dentry); if (error) return error; @@ -4506,7 +4499,7 @@ int vfs_link(struct dentry *old_dentry, struct user_namespace *mnt_userns, * be writen back improperly if their true value is unknown to * the vfs.
*/ - if (HAS_UNMAPPED_ID(mnt_userns, inode)) + if (HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; if (!dir->i_op->link) return -EPERM; @@ -4553,7 +4546,7 @@ EXPORT_SYMBOL(vfs_link); int do_linkat(int olddfd, struct filename *old, int newdfd, struct filename *new, int flags) { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; struct dentry *new_dentry; struct path old_path, new_path; struct inode *delegated_inode = NULL; @@ -4590,14 +4583,14 @@ retry: error = -EXDEV; if (old_path.mnt != new_path.mnt) goto out_dput; - mnt_userns = mnt_user_ns(new_path.mnt); - error = may_linkat(mnt_userns, &old_path); + idmap = mnt_idmap(new_path.mnt); + error = may_linkat(idmap, &old_path); if (unlikely(error)) goto out_dput; error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_dput; - error = vfs_link(old_path.dentry, mnt_userns, new_path.dentry->d_inode, + error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, new_dentry, &delegated_inode); out_dput: done_path_create(&new_path, new_dentry); @@ -4697,20 +4690,20 @@ int vfs_rename(struct renamedata *rd) if (source == target) return 0; - error = may_delete(rd->old_mnt_userns, old_dir, old_dentry, is_dir); + error = may_delete(rd->old_mnt_idmap, old_dir, old_dentry, is_dir); if (error) return error; if (!target) { - error = may_create(rd->new_mnt_userns, new_dir, new_dentry); + error = may_create(rd->new_mnt_idmap, new_dir, new_dentry); } else { new_is_dir = d_is_dir(new_dentry); if (!(flags & RENAME_EXCHANGE)) - error = may_delete(rd->new_mnt_userns, new_dir, + error = may_delete(rd->new_mnt_idmap, new_dir, new_dentry, is_dir); else - error = may_delete(rd->new_mnt_userns, new_dir, + error = may_delete(rd->new_mnt_idmap, new_dir, new_dentry, new_is_dir); } if (error) @@ -4725,13 +4718,13 @@ int vfs_rename(struct renamedata *rd) */ if (new_dir != old_dir) { if (is_dir) { - error = inode_permission(rd->old_mnt_userns, source, + error = inode_permission(rd->old_mnt_idmap, source, MAY_WRITE); if (error) return error; } if ((flags & RENAME_EXCHANGE) && new_is_dir) { - error = inode_permission(rd->new_mnt_userns, target, + error = inode_permission(rd->new_mnt_idmap, target, MAY_WRITE); if (error) return error; @@ -4776,7 +4769,7 @@ int vfs_rename(struct renamedata *rd) if (error) goto out; } - error = old_dir->i_op->rename(rd->new_mnt_userns, old_dir, old_dentry, + error = old_dir->i_op->rename(rd->new_mnt_idmap, old_dir, old_dentry, new_dir, new_dentry, flags); if (error) goto out; @@ -4921,10 +4914,10 @@ retry_deleg: rd.old_dir = old_path.dentry->d_inode; rd.old_dentry = old_dentry; - rd.old_mnt_userns = mnt_user_ns(old_path.mnt); + rd.old_mnt_idmap = mnt_idmap(old_path.mnt); rd.new_dir = new_path.dentry->d_inode; rd.new_dentry = new_dentry; - rd.new_mnt_userns = mnt_user_ns(new_path.mnt); + rd.new_mnt_idmap = mnt_idmap(new_path.mnt); rd.delegated_inode = &delegated_inode; rd.flags = flags; error = vfs_rename(&rd); diff --git a/fs/namespace.c b/fs/namespace.c index ab467ee58341..bc0f15257b49 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -75,22 +75,6 @@ static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ -struct mnt_idmap { - struct user_namespace *owner; - refcount_t count; -}; - -/* - * Carries the initial idmapping of 0:0:4294967295 which is an identity - * mapping. 
This means that {g,u}id 0 is mapped to {g,u}id 0, {g,u}id 1 is - * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...]. - */ -struct mnt_idmap nop_mnt_idmap = { - .owner = &init_user_ns, - .count = REFCOUNT_INIT(1), -}; -EXPORT_SYMBOL_GPL(nop_mnt_idmap); - struct mount_kattr { unsigned int attr_set; unsigned int attr_clr; @@ -210,104 +194,6 @@ int mnt_get_count(struct mount *mnt) #endif } -/** - * mnt_idmap_owner - retrieve owner of the mount's idmapping - * @idmap: mount idmapping - * - * This helper will go away once the conversion to use struct mnt_idmap - * everywhere has finished at which point the helper will be unexported. - * - * Only code that needs to perform permission checks based on the owner of the - * idmapping will get access to it. All other code will solely rely on - * idmappings. This will get us type safety so it's impossible to conflate - * filesystems idmappings with mount idmappings. - * - * Return: The owner of the idmapping. - */ -struct user_namespace *mnt_idmap_owner(const struct mnt_idmap *idmap) -{ - return idmap->owner; -} -EXPORT_SYMBOL_GPL(mnt_idmap_owner); - -/** - * mnt_user_ns - retrieve owner of an idmapped mount - * @mnt: the relevant vfsmount - * - * This helper will go away once the conversion to use struct mnt_idmap - * everywhere has finished at which point the helper will be unexported. - * - * Only code that needs to perform permission checks based on the owner of the - * idmapping will get access to it. All other code will solely rely on - * idmappings. This will get us type safety so it's impossible to conflate - * filesystems idmappings with mount idmappings. - * - * Return: The owner of the idmapped. - */ -struct user_namespace *mnt_user_ns(const struct vfsmount *mnt) -{ - struct mnt_idmap *idmap = mnt_idmap(mnt); - - /* Return the actual owner of the filesystem instead of the nop. */ - if (idmap == &nop_mnt_idmap && - !initial_idmapping(mnt->mnt_sb->s_user_ns)) - return mnt->mnt_sb->s_user_ns; - return mnt_idmap_owner(idmap); -} -EXPORT_SYMBOL_GPL(mnt_user_ns); - -/** - * alloc_mnt_idmap - allocate a new idmapping for the mount - * @mnt_userns: owning userns of the idmapping - * - * Allocate a new struct mnt_idmap which carries the idmapping of the mount. - * - * Return: On success a new idmap, on error an error pointer is returned. - */ -static struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns) -{ - struct mnt_idmap *idmap; - - idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT); - if (!idmap) - return ERR_PTR(-ENOMEM); - - idmap->owner = get_user_ns(mnt_userns); - refcount_set(&idmap->count, 1); - return idmap; -} - -/** - * mnt_idmap_get - get a reference to an idmapping - * @idmap: the idmap to bump the reference on - * - * If @idmap is not the @nop_mnt_idmap bump the reference count. - * - * Return: @idmap with reference count bumped if @not_mnt_idmap isn't passed. - */ -static inline struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap) -{ - if (idmap != &nop_mnt_idmap) - refcount_inc(&idmap->count); - - return idmap; -} - -/** - * mnt_idmap_put - put a reference to an idmapping - * @idmap: the idmap to put the reference on - * - * If this is a non-initial idmapping, put the reference count when a mount is - * released and free it if we're the last user. 
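The helpers removed here encode one lifetime rule worth keeping in mind: &nop_mnt_idmap is a static singleton and is never reference-counted, while every other idmap pins its owning user namespace until the last reference drops. Restated as a hedged sketch mirroring the deleted code (illustration only; after this series struct mnt_idmap is no longer defined in fs/namespace.c, so out-of-tree code cannot open-code this):

	static inline void my_idmap_put(struct mnt_idmap *idmap)
	{
		/* The nop idmap is immortal; only dynamically allocated
		 * idmaps take and drop references.
		 */
		if (idmap != &nop_mnt_idmap &&
		    refcount_dec_and_test(&idmap->count)) {
			put_user_ns(idmap->owner);
			kfree(idmap);
		}
	}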
- */ -static inline void mnt_idmap_put(struct mnt_idmap *idmap) -{ - if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) { - put_user_ns(idmap->owner); - kfree(idmap); - } -} - static struct mount *alloc_vfsmnt(const char *name) { struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); @@ -1397,6 +1283,17 @@ struct vfsmount *mntget(struct vfsmount *mnt) } EXPORT_SYMBOL(mntget); +/* + * Make a mount point inaccessible to new lookups. + * Because there may still be current users, the caller MUST WAIT + * for an RCU grace period before destroying the mount point. + */ +void mnt_make_shortterm(struct vfsmount *mnt) +{ + if (mnt) + real_mount(mnt)->mnt_ns = NULL; +} + /** * path_is_mountpoint() - Check if path is a mount in the current namespace. * @path: path to check @@ -4094,7 +3991,7 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) * Creating an idmapped mount with the filesystem wide idmapping * doesn't make sense so block that. We don't allow mushy semantics. */ - if (mnt_idmap_owner(kattr->mnt_idmap) == fs_userns) + if (!check_fsmapping(kattr->mnt_idmap, m->mnt_sb)) return -EINVAL; /* @@ -4340,7 +4237,7 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, * result. */ mnt_userns = container_of(ns, struct user_namespace, ns); - if (initial_idmapping(mnt_userns)) { + if (mnt_userns == &init_user_ns) { err = -EPERM; goto out_fput; } @@ -4573,8 +4470,8 @@ EXPORT_SYMBOL_GPL(kern_mount); void kern_unmount(struct vfsmount *mnt) { /* release long term mount so mount point can be released */ - if (!IS_ERR_OR_NULL(mnt)) { - real_mount(mnt)->mnt_ns = NULL; + if (!IS_ERR(mnt)) { + mnt_make_shortterm(mnt); synchronize_rcu(); /* yecchhh... */ mntput(mnt); } @@ -4586,8 +4483,7 @@ void kern_unmount_array(struct vfsmount *mnt[], unsigned int num) unsigned int i; for (i = 0; i < num; i++) - if (mnt[i]) - real_mount(mnt[i])->mnt_ns = NULL; + mnt_make_shortterm(mnt[i]); synchronize_rcu_expedited(); for (i = 0; i < num; i++) mntput(mnt[i]); diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile index f684c0cd1ec5..386d6fb92793 100644 --- a/fs/netfs/Makefile +++ b/fs/netfs/Makefile @@ -3,6 +3,7 @@ netfs-y := \ buffered_read.o \ io.o \ + iterator.o \ main.o \ objects.o diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c new file mode 100644 index 000000000000..f00d43b8ac0a --- /dev/null +++ b/fs/netfs/iterator.c @@ -0,0 +1,369 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Iterator helpers. + * + * Copyright (C) 2022 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/export.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/uio.h> +#include <linux/scatterlist.h> +#include <linux/netfs.h> +#include "internal.h" + +/** + * netfs_extract_user_iter - Extract the pages from a user iterator into a bvec + * @orig: The original iterator + * @orig_len: The amount of iterator to copy + * @new: The iterator to be set up + * @extraction_flags: Flags to qualify the request + * + * Extract the page fragments from the given amount of the source iterator and + * build up a second iterator that refers to all of those bits. This allows + * the original iterator to be disposed of. + * + * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA be + * allowed on the pages extracted. + * + * On success, the number of elements in the bvec is returned, the original + * iterator will have been advanced by the amount extracted.
+ * + * The iov_iter_extract_mode() function should be used to query how cleanup + * should be performed. + */ +ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len, + struct iov_iter *new, + iov_iter_extraction_t extraction_flags) +{ + struct bio_vec *bv = NULL; + struct page **pages; + unsigned int cur_npages; + unsigned int max_pages; + unsigned int npages = 0; + unsigned int i; + ssize_t ret; + size_t count = orig_len, offset, len; + size_t bv_size, pg_size; + + if (WARN_ON_ONCE(!iter_is_ubuf(orig) && !iter_is_iovec(orig))) + return -EIO; + + max_pages = iov_iter_npages(orig, INT_MAX); + bv_size = array_size(max_pages, sizeof(*bv)); + bv = kvmalloc(bv_size, GFP_KERNEL); + if (!bv) + return -ENOMEM; + + /* Put the page list at the end of the bvec list storage. bvec + * elements are larger than page pointers, so as long as we work + * 0->last, we should be fine. + */ + pg_size = array_size(max_pages, sizeof(*pages)); + pages = (void *)bv + bv_size - pg_size; + + while (count && npages < max_pages) { + ret = iov_iter_extract_pages(orig, &pages, count, + max_pages - npages, extraction_flags, + &offset); + if (ret < 0) { + pr_err("Couldn't get user pages (rc=%zd)\n", ret); + break; + } + + if (ret > count) { + pr_err("get_pages rc=%zd more than %zu\n", ret, count); + break; + } + + count -= ret; + ret += offset; + cur_npages = DIV_ROUND_UP(ret, PAGE_SIZE); + + if (npages + cur_npages > max_pages) { + pr_err("Out of bvec array capacity (%u vs %u)\n", + npages + cur_npages, max_pages); + break; + } + + for (i = 0; i < cur_npages; i++) { + len = ret > PAGE_SIZE ? PAGE_SIZE : ret; + bvec_set_page(bv + npages + i, *pages++, len - offset, offset); + ret -= len; + offset = 0; + } + + npages += cur_npages; + } + + iov_iter_bvec(new, orig->data_source, bv, npages, orig_len - count); + return npages; +} +EXPORT_SYMBOL_GPL(netfs_extract_user_iter); + +/* + * Extract and pin a list of up to sg_max pages from UBUF- or IOVEC-class + * iterators, and add them to the scatterlist. + */ +static ssize_t netfs_extract_user_to_sg(struct iov_iter *iter, + ssize_t maxsize, + struct sg_table *sgtable, + unsigned int sg_max, + iov_iter_extraction_t extraction_flags) +{ + struct scatterlist *sg = sgtable->sgl + sgtable->nents; + struct page **pages; + unsigned int npages; + ssize_t ret = 0, res; + size_t len, off; + + /* We decant the page list into the tail of the scatterlist */ + pages = (void *)sgtable->sgl + array_size(sg_max, sizeof(struct scatterlist)); + pages -= sg_max; + + do { + res = iov_iter_extract_pages(iter, &pages, maxsize, sg_max, + extraction_flags, &off); + if (res < 0) + goto failed; + + len = res; + maxsize -= len; + ret += len; + npages = DIV_ROUND_UP(off + len, PAGE_SIZE); + sg_max -= npages; + + for (; npages > 0; npages--) { + struct page *page = *pages; + size_t seg = min_t(size_t, PAGE_SIZE - off, len); + + *pages++ = NULL; + sg_set_page(sg, page, len, off); + sgtable->nents++; + sg++; + len -= seg; + off = 0; + } + } while (maxsize > 0 && sg_max > 0); + + return ret; + +failed: + while (sgtable->nents > sgtable->orig_nents) + put_page(sg_page(&sgtable->sgl[--sgtable->nents])); + return res; +} + +/* + * Extract up to sg_max pages from a BVEC-type iterator and add them to the + * scatterlist. The pages are not pinned.
+ */ +static ssize_t netfs_extract_bvec_to_sg(struct iov_iter *iter, + ssize_t maxsize, + struct sg_table *sgtable, + unsigned int sg_max, + iov_iter_extraction_t extraction_flags) +{ + const struct bio_vec *bv = iter->bvec; + struct scatterlist *sg = sgtable->sgl + sgtable->nents; + unsigned long start = iter->iov_offset; + unsigned int i; + ssize_t ret = 0; + + for (i = 0; i < iter->nr_segs; i++) { + size_t off, len; + + len = bv[i].bv_len; + if (start >= len) { + start -= len; + continue; + } + + len = min_t(size_t, maxsize, len - start); + off = bv[i].bv_offset + start; + + sg_set_page(sg, bv[i].bv_page, len, off); + sgtable->nents++; + sg++; + sg_max--; + + ret += len; + maxsize -= len; + if (maxsize <= 0 || sg_max == 0) + break; + start = 0; + } + + if (ret > 0) + iov_iter_advance(iter, ret); + return ret; +} + +/* + * Extract up to sg_max pages from a KVEC-type iterator and add them to the + * scatterlist. This can deal with vmalloc'd buffers as well as kmalloc'd or + * static buffers. The pages are not pinned. + */ +static ssize_t netfs_extract_kvec_to_sg(struct iov_iter *iter, + ssize_t maxsize, + struct sg_table *sgtable, + unsigned int sg_max, + iov_iter_extraction_t extraction_flags) +{ + const struct kvec *kv = iter->kvec; + struct scatterlist *sg = sgtable->sgl + sgtable->nents; + unsigned long start = iter->iov_offset; + unsigned int i; + ssize_t ret = 0; + + for (i = 0; i < iter->nr_segs; i++) { + struct page *page; + unsigned long kaddr; + size_t off, len, seg; + + len = kv[i].iov_len; + if (start >= len) { + start -= len; + continue; + } + + kaddr = (unsigned long)kv[i].iov_base + start; + off = kaddr & ~PAGE_MASK; + len = min_t(size_t, maxsize, len - start); + kaddr &= PAGE_MASK; + + maxsize -= len; + ret += len; + do { + seg = min_t(size_t, len, PAGE_SIZE - off); + if (is_vmalloc_or_module_addr((void *)kaddr)) + page = vmalloc_to_page((void *)kaddr); + else + page = virt_to_page(kaddr); + + sg_set_page(sg, page, len, off); + sgtable->nents++; + sg++; + sg_max--; + + len -= seg; + kaddr += PAGE_SIZE; + off = 0; + } while (len > 0 && sg_max > 0); + + if (maxsize <= 0 || sg_max == 0) + break; + start = 0; + } + + if (ret > 0) + iov_iter_advance(iter, ret); + return ret; +} + +/* + * Extract up to sg_max folios from an XARRAY-type iterator and add them to + * the scatterlist. The pages are not pinned. 
+ */ +static ssize_t netfs_extract_xarray_to_sg(struct iov_iter *iter, + ssize_t maxsize, + struct sg_table *sgtable, + unsigned int sg_max, + iov_iter_extraction_t extraction_flags) +{ + struct scatterlist *sg = sgtable->sgl + sgtable->nents; + struct xarray *xa = iter->xarray; + struct folio *folio; + loff_t start = iter->xarray_start + iter->iov_offset; + pgoff_t index = start / PAGE_SIZE; + ssize_t ret = 0; + size_t offset, len; + XA_STATE(xas, xa, index); + + rcu_read_lock(); + + xas_for_each(&xas, folio, ULONG_MAX) { + if (xas_retry(&xas, folio)) + continue; + if (WARN_ON(xa_is_value(folio))) + break; + if (WARN_ON(folio_test_hugetlb(folio))) + break; + + offset = offset_in_folio(folio, start); + len = min_t(size_t, maxsize, folio_size(folio) - offset); + + sg_set_page(sg, folio_page(folio, 0), len, offset); + sgtable->nents++; + sg++; + sg_max--; + + maxsize -= len; + ret += len; + if (maxsize <= 0 || sg_max == 0) + break; + } + + rcu_read_unlock(); + if (ret > 0) + iov_iter_advance(iter, ret); + return ret; +} + +/** + * netfs_extract_iter_to_sg - Extract pages from an iterator and add to an sglist + * @iter: The iterator to extract from + * @maxsize: The amount of iterator to copy + * @sgtable: The scatterlist table to fill in + * @sg_max: Maximum number of elements in @sgtable that may be filled + * @extraction_flags: Flags to qualify the request + * + * Extract the page fragments from the given amount of the source iterator and + * add them to a scatterlist that refers to all of those bits, to a maximum + * addition of @sg_max elements. + * + * The pages referred to by UBUF- and IOVEC-type iterators are extracted and + * pinned; BVEC-, KVEC- and XARRAY-type are extracted but aren't pinned; PIPE- + * and DISCARD-type are not supported. + * + * No end mark is placed on the scatterlist; that's left to the caller. + * + * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA + * be allowed on the pages extracted. + * + * If successful, @sgtable->nents is updated to include the number of elements + * added and the number of bytes added is returned. @sgtable->orig_nents is + * left unaltered. + * + * The iov_iter_extract_mode() function should be used to query how cleanup + * should be performed. + */ +ssize_t netfs_extract_iter_to_sg(struct iov_iter *iter, size_t maxsize, + struct sg_table *sgtable, unsigned int sg_max, + iov_iter_extraction_t extraction_flags) +{ + if (maxsize == 0) + return 0; + + switch (iov_iter_type(iter)) { + case ITER_UBUF: + case ITER_IOVEC: + return netfs_extract_user_to_sg(iter, maxsize, sgtable, sg_max, + extraction_flags); + case ITER_BVEC: + return netfs_extract_bvec_to_sg(iter, maxsize, sgtable, sg_max, + extraction_flags); + case ITER_KVEC: + return netfs_extract_kvec_to_sg(iter, maxsize, sgtable, sg_max, + extraction_flags); + case ITER_XARRAY: + return netfs_extract_xarray_to_sg(iter, maxsize, sgtable, sg_max, + extraction_flags); + default: + pr_err("%s(%u) unsupported\n", __func__, iov_iter_type(iter)); + WARN_ON_ONCE(1); + return -EIO; + } +} +EXPORT_SYMBOL_GPL(netfs_extract_iter_to_sg); diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 1ead5bd740c2..14a72224b657 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -209,8 +209,8 @@ config NFS_DISABLE_UDP_SUPPORT config NFS_V4_2_READ_PLUS bool "NFS: Enable support for the NFSv4.2 READ_PLUS operation" depends on NFS_V4_2 - default y + default n help - Choose Y here to enable the use of READ_PLUS over NFS v4.2.
READ_PLUS - attempts to improve read performance by compressing out sparse holes - in the file contents. + This is intended for developers only. The READ_PLUS operation has + been shown to have issues under specific conditions and should not + be used in production. diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index d0cccddb7d08..321af81c456e 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -980,14 +980,11 @@ out_invalidcred: } static int -nfs_callback_dispatch(struct svc_rqst *rqstp, __be32 *statp) +nfs_callback_dispatch(struct svc_rqst *rqstp) { const struct svc_procedure *procp = rqstp->rq_procinfo; - svcxdr_init_decode(rqstp); - svcxdr_init_encode(rqstp); - - *statp = procp->pc_func(rqstp); + *rqstp->rq_accept_statp = procp->pc_func(rqstp); return 1; } @@ -1072,7 +1069,8 @@ static const struct svc_procedure nfs4_callback_procedures1[] = { } }; -static unsigned int nfs4_callback_count1[ARRAY_SIZE(nfs4_callback_procedures1)]; +static DEFINE_PER_CPU_ALIGNED(unsigned long, + nfs4_callback_count1[ARRAY_SIZE(nfs4_callback_procedures1)]); const struct svc_version nfs4_callback_version1 = { .vs_vers = 1, .vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1), @@ -1084,7 +1082,8 @@ const struct svc_version nfs4_callback_version1 = { .vs_need_cong_ctrl = true, }; -static unsigned int nfs4_callback_count4[ARRAY_SIZE(nfs4_callback_procedures1)]; +static DEFINE_PER_CPU_ALIGNED(unsigned long, + nfs4_callback_count4[ARRAY_SIZE(nfs4_callback_procedures1)]); const struct svc_version nfs4_callback_version4 = { .vs_vers = 4, .vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1), diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f7e4a88d5d92..a41c3ee4549c 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -203,14 +203,14 @@ static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie, { struct nfs_cache_array *array; - array = kmap_atomic(page); + array = kmap_local_page(page); array->change_attr = change_attr; array->last_cookie = last_cookie; array->size = 0; array->page_full = 0; array->page_is_eof = 0; array->cookies_are_ordered = 1; - kunmap_atomic(array); + kunmap_local(array); } /* @@ -221,11 +221,11 @@ static void nfs_readdir_clear_array(struct page *page) struct nfs_cache_array *array; unsigned int i; - array = kmap_atomic(page); + array = kmap_local_page(page); for (i = 0; i < array->size; i++) kfree(array->array[i].name); array->size = 0; - kunmap_atomic(array); + kunmap_local(array); } static void nfs_readdir_free_folio(struct folio *folio) @@ -371,14 +371,14 @@ static pgoff_t nfs_readdir_page_cookie_hash(u64 cookie) static bool nfs_readdir_page_validate(struct page *page, u64 last_cookie, u64 change_attr) { - struct nfs_cache_array *array = kmap_atomic(page); + struct nfs_cache_array *array = kmap_local_page(page); int ret = true; if (array->change_attr != change_attr) ret = false; if (nfs_readdir_array_index_cookie(array) != last_cookie) ret = false; - kunmap_atomic(array); + kunmap_local(array); return ret; } @@ -418,9 +418,9 @@ static u64 nfs_readdir_page_last_cookie(struct page *page) struct nfs_cache_array *array; u64 ret; - array = kmap_atomic(page); + array = kmap_local_page(page); ret = array->last_cookie; - kunmap_atomic(array); + kunmap_local(array); return ret; } @@ -429,9 +429,9 @@ static bool nfs_readdir_page_needs_filling(struct page *page) struct nfs_cache_array *array; bool ret; - array = kmap_atomic(page); + array = kmap_local_page(page); ret = !nfs_readdir_array_is_full(array); - kunmap_atomic(array); + kunmap_local(array); return ret; } @@ 
-439,9 +439,9 @@ static void nfs_readdir_page_set_eof(struct page *page) { struct nfs_cache_array *array; - array = kmap_atomic(page); + array = kmap_local_page(page); nfs_readdir_array_set_eof(array); - kunmap_atomic(array); + kunmap_local(array); } static struct page *nfs_readdir_page_get_next(struct address_space *mapping, @@ -568,14 +568,14 @@ static int nfs_readdir_search_array(struct nfs_readdir_descriptor *desc) struct nfs_cache_array *array; int status; - array = kmap_atomic(desc->page); + array = kmap_local_page(desc->page); if (desc->dir_cookie == 0) status = nfs_readdir_search_for_pos(array, desc); else status = nfs_readdir_search_for_cookie(array, desc); - kunmap_atomic(array); + kunmap_local(array); return status; } @@ -2296,7 +2296,7 @@ EXPORT_SYMBOL_GPL(nfs_instantiate); * that the operation succeeded on the server, but an error in the * reply path made it appear to have failed. */ -int nfs_create(struct user_namespace *mnt_userns, struct inode *dir, +int nfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct iattr attr; @@ -2325,7 +2325,7 @@ EXPORT_SYMBOL_GPL(nfs_create); * See comments for nfs_proc_create regarding failed operations. */ int -nfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +nfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct iattr attr; @@ -2352,7 +2352,7 @@ EXPORT_SYMBOL_GPL(nfs_mknod); /* * See comments for nfs_proc_create regarding failed operations. */ -int nfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +int nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct iattr attr; @@ -2524,7 +2524,7 @@ EXPORT_SYMBOL_GPL(nfs_unlink); * now have a new file handle and can instantiate an in-core NFS inode * and move the raw page into its mapping. */ -int nfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct page *page; @@ -2642,7 +2642,7 @@ nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data) * If these conditions are met, we can drop the dentries before doing * the rename. */ -int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -3262,7 +3262,7 @@ static int nfs_execute_ok(struct inode *inode, int mask) return ret; } -int nfs_permission(struct user_namespace *mnt_userns, +int nfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { @@ -3313,7 +3313,7 @@ out_notsup: res = nfs_revalidate_inode(inode, NFS_INO_INVALID_MODE | NFS_INO_INVALID_OTHER); if (res == 0) - res = generic_permission(&init_user_ns, inode, mask); + res = generic_permission(&nop_mnt_idmap, inode, mask); goto out; } EXPORT_SYMBOL_GPL(nfs_permission); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 1707f46b1335..9a18c5a69ace 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -343,14 +343,12 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, struct nfs_page *req; unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); /* XXX do we need to do the eof zeroing found in async_filler? 
*/ - req = nfs_create_request(dreq->ctx, pagevec[i], - pgbase, req_len); + req = nfs_page_create_from_page(dreq->ctx, pagevec[i], + pgbase, pos, req_len); if (IS_ERR(req)) { result = PTR_ERR(req); break; } - req->wb_index = pos >> PAGE_SHIFT; - req->wb_offset = pos & ~PAGE_MASK; if (!nfs_pageio_add_request(&desc, req)) { result = desc.pg_error; nfs_release_request(req); @@ -802,8 +800,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, struct nfs_page *req; unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); - req = nfs_create_request(dreq->ctx, pagevec[i], - pgbase, req_len); + req = nfs_page_create_from_page(dreq->ctx, pagevec[i], + pgbase, pos, req_len); if (IS_ERR(req)) { result = PTR_ERR(req); break; @@ -816,8 +814,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, } nfs_lock_request(req); - req->wb_index = pos >> PAGE_SHIFT; - req->wb_offset = pos & ~PAGE_MASK; if (!nfs_pageio_add_request(&desc, req)) { result = desc.pg_error; nfs_unlock_and_release_request(req); diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 01596f2d0a1e..d6a6d1ebb8fd 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -42,7 +42,7 @@ nfs_encode_fh(struct inode *inode, __u32 *p, int *max_len, struct inode *parent) dprintk("%s: max fh len %d inode %p parent %p", __func__, *max_len, inode, parent); - if (*max_len < len || IS_AUTOMOUNT(inode)) { + if (*max_len < len) { dprintk("%s: fh len %d too small, required %d\n", __func__, *max_len, len); *max_len = len; @@ -145,17 +145,10 @@ out: return parent; } -static u64 nfs_fetch_iversion(struct inode *inode) -{ - nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); - return inode_peek_iversion_raw(inode); -} - const struct export_operations nfs_export_ops = { .encode_fh = nfs_encode_fh, .fh_to_dentry = nfs_fh_to_dentry, .get_parent = nfs_get_parent, - .fetch_iversion = nfs_fetch_iversion, .flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK| EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS| EXPORT_OP_NOATOMIC_ATTR, diff --git a/fs/nfs/file.c b/fs/nfs/file.c index d8ec889a4b3f..893625eacab9 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -31,6 +31,7 @@ #include <linux/swap.h> #include <linux/uaccess.h> +#include <linux/filelock.h> #include "delegation.h" #include "internal.h" @@ -276,27 +277,28 @@ EXPORT_SYMBOL_GPL(nfs_file_fsync); * and that the new data won't completely replace the old data in * that range of the file. 
*/ -static bool nfs_full_page_write(struct page *page, loff_t pos, unsigned int len) +static bool nfs_folio_is_full_write(struct folio *folio, loff_t pos, + unsigned int len) { - unsigned int pglen = nfs_page_length(page); - unsigned int offset = pos & (PAGE_SIZE - 1); + unsigned int pglen = nfs_folio_length(folio); + unsigned int offset = offset_in_folio(folio, pos); unsigned int end = offset + len; return !pglen || (end >= pglen && !offset); } -static bool nfs_want_read_modify_write(struct file *file, struct page *page, - loff_t pos, unsigned int len) +static bool nfs_want_read_modify_write(struct file *file, struct folio *folio, + loff_t pos, unsigned int len) { /* * Up-to-date pages, those with ongoing or full-page write * don't need read/modify/write */ - if (PageUptodate(page) || PagePrivate(page) || - nfs_full_page_write(page, pos, len)) + if (folio_test_uptodate(folio) || folio_test_private(folio) || + nfs_folio_is_full_write(folio, pos, len)) return false; - if (pnfs_ld_read_whole_page(file->f_mapping->host)) + if (pnfs_ld_read_whole_page(file_inode(file))) return true; /* Open for reading too? */ if (file->f_mode & FMODE_READ) @@ -304,6 +306,15 @@ static bool nfs_want_read_modify_write(struct file *file, struct page *page, return false; } +static struct folio * +nfs_folio_grab_cache_write_begin(struct address_space *mapping, pgoff_t index) +{ + unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; + + return __filemap_get_folio(mapping, index, fgp_flags, + mapping_gfp_mask(mapping)); +} + /* * This does the "real" work of the write. We must allocate and lock the * page to be sent back to the generic routine, which then copies the @@ -313,32 +324,31 @@ static bool nfs_want_read_modify_write(struct file *file, struct page *page, * increment the page use counts until he is done with the page. 
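 *
 * Editor's note, an illustration rather than part of the patch: the
 * nfs_folio_grab_cache_write_begin() wrapper above is the folio form of
 * grab_cache_page_write_begin(), which uses the same FGP_LOCK | FGP_WRITE |
 * FGP_CREAT | FGP_STABLE combination; FGP_STABLE is what makes
 * __filemap_get_folio() wait for any pending stable-page writeback before
 * handing back the locked folio.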
*/ static int nfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + loff_t pos, unsigned len, struct page **pagep, + void **fsdata) { - int ret; - pgoff_t index = pos >> PAGE_SHIFT; - struct page *page; + struct folio *folio; int once_thru = 0; + int ret; dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); start: - page = grab_cache_page_write_begin(mapping, index); - if (!page) + folio = nfs_folio_grab_cache_write_begin(mapping, pos >> PAGE_SHIFT); + if (!folio) return -ENOMEM; - *pagep = page; + *pagep = &folio->page; - ret = nfs_flush_incompatible(file, page); + ret = nfs_flush_incompatible(file, folio); if (ret) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } else if (!once_thru && - nfs_want_read_modify_write(file, page, pos, len)) { + nfs_want_read_modify_write(file, folio, pos, len)) { once_thru = 1; - ret = nfs_read_folio(file, page_folio(page)); - put_page(page); + ret = nfs_read_folio(file, folio); + folio_put(folio); if (!ret) goto start; } @@ -346,11 +356,12 @@ start: } static int nfs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { - unsigned offset = pos & (PAGE_SIZE - 1); struct nfs_open_context *ctx = nfs_file_open_context(file); + struct folio *folio = page_folio(page); + unsigned offset = offset_in_folio(folio, pos); int status; dfprintk(PAGECACHE, "NFS: write_end(%pD2(%lu), %u@%lld)\n", @@ -360,26 +371,26 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, * Zero any uninitialised parts of the page, and then mark the page * as up to date if it turns out that we're extending the file. 
*/ - if (!PageUptodate(page)) { - unsigned pglen = nfs_page_length(page); + if (!folio_test_uptodate(folio)) { + size_t fsize = folio_size(folio); + unsigned pglen = nfs_folio_length(folio); unsigned end = offset + copied; if (pglen == 0) { - zero_user_segments(page, 0, offset, - end, PAGE_SIZE); - SetPageUptodate(page); + folio_zero_segments(folio, 0, offset, end, fsize); + folio_mark_uptodate(folio); } else if (end >= pglen) { - zero_user_segment(page, end, PAGE_SIZE); + folio_zero_segment(folio, end, fsize); if (offset == 0) - SetPageUptodate(page); + folio_mark_uptodate(folio); } else - zero_user_segment(page, pglen, PAGE_SIZE); + folio_zero_segment(folio, pglen, fsize); } - status = nfs_updatepage(file, page, offset, copied); + status = nfs_update_folio(file, folio, offset, copied); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); if (status < 0) return status; @@ -401,14 +412,16 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, static void nfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) { + struct inode *inode = folio_file_mapping(folio)->host; dfprintk(PAGECACHE, "NFS: invalidate_folio(%lu, %zu, %zu)\n", folio->index, offset, length); if (offset != 0 || length < folio_size(folio)) return; /* Cancel any unstarted writes on this page */ - nfs_wb_folio_cancel(folio->mapping->host, folio); + nfs_wb_folio_cancel(inode, folio); folio_wait_fscache(folio); + trace_nfs_invalidate_folio(inode, folio); } /* @@ -422,8 +435,13 @@ static bool nfs_release_folio(struct folio *folio, gfp_t gfp) dfprintk(PAGECACHE, "NFS: release_folio(%p)\n", folio); /* If the private flag is set, then the folio is not freeable */ - if (folio_test_private(folio)) - return false; + if (folio_test_private(folio)) { + if ((current_gfp_context(gfp) & GFP_KERNEL) != GFP_KERNEL || + current_is_kswapd()) + return false; + if (nfs_wb_folio(folio_file_mapping(folio)->host, folio) < 0) + return false; + } return nfs_fscache_release_folio(folio, gfp); } @@ -464,12 +482,15 @@ static void nfs_check_dirty_writeback(struct folio *folio, static int nfs_launder_folio(struct folio *folio) { struct inode *inode = folio->mapping->host; + int ret; dfprintk(PAGECACHE, "NFS: launder_folio(%ld, %llu)\n", inode->i_ino, folio_pos(folio)); folio_wait_fscache(folio); - return nfs_wb_page(inode, &folio->page); + ret = nfs_wb_folio(inode, folio); + trace_nfs_launder_folio_done(inode, folio, ret); + return ret; } static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, @@ -546,22 +567,22 @@ const struct address_space_operations nfs_file_aops = { */ static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf) { - struct page *page = vmf->page; struct file *filp = vmf->vma->vm_file; struct inode *inode = file_inode(filp); unsigned pagelen; vm_fault_t ret = VM_FAULT_NOPAGE; struct address_space *mapping; + struct folio *folio = page_folio(vmf->page); dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%lu), offset %lld)\n", - filp, filp->f_mapping->host->i_ino, - (long long)page_offset(page)); + filp, filp->f_mapping->host->i_ino, + (long long)folio_file_pos(folio)); sb_start_pagefault(inode->i_sb); /* make sure the cache has finished storing the page */ - if (PageFsCache(page) && - wait_on_page_fscache_killable(vmf->page) < 0) { + if (folio_test_fscache(folio) && + folio_wait_fscache_killable(folio) < 0) { ret = VM_FAULT_RETRY; goto out; } @@ -570,25 +591,25 @@ static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf) nfs_wait_bit_killable, 
TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); - lock_page(page); - mapping = page_file_mapping(page); + folio_lock(folio); + mapping = folio_file_mapping(folio); if (mapping != inode->i_mapping) goto out_unlock; - wait_on_page_writeback(page); + folio_wait_writeback(folio); - pagelen = nfs_page_length(page); + pagelen = nfs_folio_length(folio); if (pagelen == 0) goto out_unlock; ret = VM_FAULT_LOCKED; - if (nfs_flush_incompatible(filp, page) == 0 && - nfs_updatepage(filp, page, 0, pagelen) == 0) + if (nfs_flush_incompatible(filp, folio) == 0 && + nfs_update_folio(filp, folio, 0, pagelen) == 0) goto out; ret = VM_FAULT_SIGBUS; out_unlock: - unlock_page(page); + folio_unlock(folio); out: sb_end_pagefault(inode->i_sb); return ret; diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 4974cd18ca46..ce8f8934bca5 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -862,6 +862,8 @@ fl_pnfs_update_layout(struct inode *ino, status = filelayout_check_deviceid(lo, fl, gfp_flags); if (status) { + pnfs_error_mark_layout_for_return(ino, lseg); + pnfs_set_lo_fail(lseg); pnfs_put_lseg(lseg); lseg = NULL; } diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index e731c00a9fcb..ea5f2976dfab 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -245,14 +245,12 @@ static int fscache_fallback_read_page(struct inode *inode, struct page *page) struct netfs_cache_resources cres; struct fscache_cookie *cookie = nfs_i_fscache(inode); struct iov_iter iter; - struct bio_vec bvec[1]; + struct bio_vec bvec; int ret; memset(&cres, 0, sizeof(cres)); - bvec[0].bv_page = page; - bvec[0].bv_offset = 0; - bvec[0].bv_len = PAGE_SIZE; - iov_iter_bvec(&iter, ITER_DEST, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + bvec_set_page(&bvec, page, PAGE_SIZE, 0); + iov_iter_bvec(&iter, ITER_DEST, &bvec, 1, PAGE_SIZE); ret = fscache_begin_read_operation(&cres, cookie); if (ret < 0) @@ -273,16 +271,14 @@ static int fscache_fallback_write_page(struct inode *inode, struct page *page, struct netfs_cache_resources cres; struct fscache_cookie *cookie = nfs_i_fscache(inode); struct iov_iter iter; - struct bio_vec bvec[1]; + struct bio_vec bvec; loff_t start = page_offset(page); size_t len = PAGE_SIZE; int ret; memset(&cres, 0, sizeof(cres)); - bvec[0].bv_page = page; - bvec[0].bv_offset = 0; - bvec[0].bv_len = PAGE_SIZE; - iov_iter_bvec(&iter, ITER_SOURCE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + bvec_set_page(&bvec, page, PAGE_SIZE, 0); + iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE); ret = fscache_begin_write_operation(&cres, cookie); if (ret < 0) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index e98ee7599eeb..222a28320e1c 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -606,7 +606,7 @@ EXPORT_SYMBOL_GPL(nfs_fhget); #define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE|ATTR_OPEN) int -nfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -825,10 +825,12 @@ static u32 nfs_get_valid_attrmask(struct inode *inode) reply_mask |= STATX_UID | STATX_GID; if (!(cache_validity & NFS_INO_INVALID_BLOCKS)) reply_mask |= STATX_BLOCKS; + if (!(cache_validity & NFS_INO_INVALID_CHANGE)) + reply_mask |= STATX_CHANGE_COOKIE; return reply_mask; } -int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int nfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct 
kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); @@ -843,7 +845,8 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, request_mask &= STATX_TYPE | STATX_MODE | STATX_NLINK | STATX_UID | STATX_GID | STATX_ATIME | STATX_MTIME | STATX_CTIME | - STATX_INO | STATX_SIZE | STATX_BLOCKS; + STATX_INO | STATX_SIZE | STATX_BLOCKS | STATX_BTIME | + STATX_CHANGE_COOKIE; if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) { if (readdirplus_enabled) @@ -851,8 +854,8 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, goto out_no_revalidate; } - /* Flush out writes to the server in order to update c/mtime. */ - if ((request_mask & (STATX_CTIME | STATX_MTIME)) && + /* Flush out writes to the server in order to update c/mtime/version. */ + if ((request_mask & (STATX_CTIME | STATX_MTIME | STATX_CHANGE_COOKIE)) && S_ISREG(inode->i_mode)) filemap_write_and_wait(inode->i_mapping); @@ -872,7 +875,8 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, /* Is the user requesting attributes that might need revalidation? */ if (!(request_mask & (STATX_MODE|STATX_NLINK|STATX_ATIME|STATX_CTIME| STATX_MTIME|STATX_UID|STATX_GID| - STATX_SIZE|STATX_BLOCKS))) + STATX_SIZE|STATX_BLOCKS| + STATX_CHANGE_COOKIE))) goto out_no_revalidate; /* Check whether the cached attributes are stale */ @@ -908,8 +912,12 @@ out_no_revalidate: /* Only return attributes that were revalidated. */ stat->result_mask = nfs_get_valid_attrmask(inode) | request_mask; - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); + stat->change_cookie = inode_peek_iversion_raw(inode); + stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC; + if (server->change_attr_type != NFS4_CHANGE_TYPE_IS_UNDEFINED) + stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC; if (S_ISDIR(inode->i_mode)) stat->blksize = NFS_SERVER(inode)->dtsize; out: diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index ae7d4a8c728c..2a65fe2a63ab 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -384,18 +384,18 @@ extern unsigned long nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc); struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int); void nfs_d_prune_case_insensitive_aliases(struct inode *inode); -int nfs_create(struct user_namespace *, struct inode *, struct dentry *, +int nfs_create(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, bool); -int nfs_mkdir(struct user_namespace *, struct inode *, struct dentry *, +int nfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, umode_t); int nfs_rmdir(struct inode *, struct dentry *); int nfs_unlink(struct inode *, struct dentry *); -int nfs_symlink(struct user_namespace *, struct inode *, struct dentry *, +int nfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *, const char *); int nfs_link(struct dentry *, struct inode *, struct dentry *); -int nfs_mknod(struct user_namespace *, struct inode *, struct dentry *, umode_t, +int nfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); -int nfs_rename(struct user_namespace *, struct inode *, struct dentry *, +int nfs_rename(struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); #ifdef CONFIG_NFS_V4_2 @@ -760,17 +760,18 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) * Record the page as unstable 
(an extra writeback period) and mark its * inode as dirty. */ -static inline -void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo) +static inline void nfs_folio_mark_unstable(struct folio *folio, + struct nfs_commit_info *cinfo) { - if (!cinfo->dreq) { - struct inode *inode = page_file_mapping(page)->host; + if (folio && !cinfo->dreq) { + struct inode *inode = folio_file_mapping(folio)->host; + long nr = folio_nr_pages(folio); /* This page is really still in write-back - just that the * writeback is happening on the server now. */ - inc_node_page_state(page, NR_WRITEBACK); - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); + node_stat_mod_folio(folio, NR_WRITEBACK, nr); + wb_stat_mod(&inode_to_bdi(inode)->wb, WB_WRITEBACK, nr); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } } @@ -795,6 +796,24 @@ unsigned int nfs_page_length(struct page *page) } /* + * Determine the number of bytes of data the page contains + */ +static inline size_t nfs_folio_length(struct folio *folio) +{ + loff_t i_size = i_size_read(folio_file_mapping(folio)->host); + + if (i_size > 0) { + pgoff_t index = folio_index(folio) >> folio_order(folio); + pgoff_t end_index = (i_size - 1) >> folio_shift(folio); + if (index < end_index) + return folio_size(folio); + if (index == end_index) + return offset_in_folio(folio, i_size - 1) + 1; + } + return 0; +} + +/* * Convert a umode to a dirent->d_type */ static inline @@ -807,11 +826,10 @@ unsigned char nfs_umode_to_dtype(umode_t mode) * Determine the number of pages in an array of length 'len' and * with a base offset of 'base' */ -static inline -unsigned int nfs_page_array_len(unsigned int base, size_t len) +static inline unsigned int nfs_page_array_len(unsigned int base, size_t len) { - return ((unsigned long)len + (unsigned long)base + - PAGE_SIZE - 1) >> PAGE_SHIFT; + return ((unsigned long)len + (unsigned long)base + PAGE_SIZE - 1) >> + PAGE_SHIFT; } /* diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index b0ef7e7ddb30..19d51ebf842c 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -208,23 +208,23 @@ out_fc: } static int -nfs_namespace_getattr(struct user_namespace *mnt_userns, +nfs_namespace_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { if (NFS_FH(d_inode(path->dentry))->size != 0) - return nfs_getattr(mnt_userns, path, stat, request_mask, + return nfs_getattr(idmap, path, stat, request_mask, query_flags); - generic_fillattr(&init_user_ns, d_inode(path->dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat); return 0; } static int -nfs_namespace_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +nfs_namespace_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { if (NFS_FH(d_inode(dentry))->size != 0) - return nfs_setattr(mnt_userns, dentry, attr); + return nfs_setattr(idmap, dentry, attr); return -EACCES; } diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h index df9ca56db347..4fa37dc038b5 100644 --- a/fs/nfs/nfs3_fs.h +++ b/fs/nfs/nfs3_fs.h @@ -12,7 +12,7 @@ */ #ifdef CONFIG_NFS_V3_ACL extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type, bool rcu); -extern int nfs3_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +extern int nfs3_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, struct posix_acl *dfacl); diff --git a/fs/nfs/nfs3acl.c 
b/fs/nfs/nfs3acl.c index 74d11e3c4205..1247f544a440 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -255,7 +255,7 @@ int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, } -int nfs3_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int nfs3_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { struct posix_acl *orig = acl, *dfacl = NULL, *alloc; diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index ecb428512fe1..93e306bf4430 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -460,7 +460,8 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, if (err >= 0) break; - if (err == -ENOTSUPP && + if ((err == -ENOTSUPP || + err == -NFS4ERR_OFFLOAD_DENIED) && nfs42_files_from_same_server(src, dst)) { err = -EOPNOTSUPP; break; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 5edd1704f735..4c9f8bd866ab 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -23,6 +23,7 @@ #define NFS4_MAX_LOOP_ON_RECOVER (10) #include <linux/seqlock.h> +#include <linux/filelock.h> struct idmap; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 40d749f29ed3..22a93ae46cd7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7692,7 +7692,7 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp) #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *key, const void *buf, size_t buflen, int flags) @@ -7716,7 +7716,7 @@ static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry) #define XATTR_NAME_NFSV4_DACL "system.nfs4_dacl" static int nfs4_xattr_set_nfs4_dacl(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *key, const void *buf, size_t buflen, int flags) @@ -7739,7 +7739,7 @@ static bool nfs4_xattr_list_nfs4_dacl(struct dentry *dentry) #define XATTR_NAME_NFSV4_SACL "system.nfs4_sacl" static int nfs4_xattr_set_nfs4_sacl(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *key, const void *buf, size_t buflen, int flags) @@ -7764,7 +7764,7 @@ static bool nfs4_xattr_list_nfs4_sacl(struct dentry *dentry) #ifdef CONFIG_NFS_V4_SECURITY_LABEL static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *key, const void *buf, size_t buflen, int flags) @@ -7815,7 +7815,7 @@ nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len) #ifdef CONFIG_NFS_V4_2 static int nfs4_xattr_set_nfs4_user(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *key, const void *buf, size_t buflen, int flags) @@ -10604,7 +10604,9 @@ static void nfs4_disable_swap(struct inode *inode) /* The state manager thread will now exit once it is * woken. 
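 *
 * Editor's note, illustration only: nfs4_schedule_state_manager() below does
 * more than the wake_up_var() it replaces; besides waking the manager it
 * sets NFS4CLNT_RUN_MANAGER, so a state manager thread that has already
 * exited is relaunched instead of the wakeup being lost.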
*/ - wake_up_var(&NFS_SERVER(inode)->nfs_client->cl_state); + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + + nfs4_schedule_state_manager(clp); } static const struct inode_operations nfs4_dir_inode_operations = { diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 214bc56f92d2..d27919d7241d 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -292,32 +292,34 @@ TRACE_DEFINE_ENUM(NFS4CLNT_MOVED); TRACE_DEFINE_ENUM(NFS4CLNT_LEASE_MOVED); TRACE_DEFINE_ENUM(NFS4CLNT_DELEGATION_EXPIRED); TRACE_DEFINE_ENUM(NFS4CLNT_RUN_MANAGER); +TRACE_DEFINE_ENUM(NFS4CLNT_MANAGER_AVAILABLE); TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_RUNNING); TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_ANY_LAYOUT_READ); TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_ANY_LAYOUT_RW); +TRACE_DEFINE_ENUM(NFS4CLNT_DELEGRETURN_DELAYED); #define show_nfs4_clp_state(state) \ __print_flags(state, "|", \ - { NFS4CLNT_MANAGER_RUNNING, "MANAGER_RUNNING" }, \ - { NFS4CLNT_CHECK_LEASE, "CHECK_LEASE" }, \ - { NFS4CLNT_LEASE_EXPIRED, "LEASE_EXPIRED" }, \ - { NFS4CLNT_RECLAIM_REBOOT, "RECLAIM_REBOOT" }, \ - { NFS4CLNT_RECLAIM_NOGRACE, "RECLAIM_NOGRACE" }, \ - { NFS4CLNT_DELEGRETURN, "DELEGRETURN" }, \ - { NFS4CLNT_SESSION_RESET, "SESSION_RESET" }, \ - { NFS4CLNT_LEASE_CONFIRM, "LEASE_CONFIRM" }, \ - { NFS4CLNT_SERVER_SCOPE_MISMATCH, \ - "SERVER_SCOPE_MISMATCH" }, \ - { NFS4CLNT_PURGE_STATE, "PURGE_STATE" }, \ - { NFS4CLNT_BIND_CONN_TO_SESSION, \ - "BIND_CONN_TO_SESSION" }, \ - { NFS4CLNT_MOVED, "MOVED" }, \ - { NFS4CLNT_LEASE_MOVED, "LEASE_MOVED" }, \ - { NFS4CLNT_DELEGATION_EXPIRED, "DELEGATION_EXPIRED" }, \ - { NFS4CLNT_RUN_MANAGER, "RUN_MANAGER" }, \ - { NFS4CLNT_RECALL_RUNNING, "RECALL_RUNNING" }, \ - { NFS4CLNT_RECALL_ANY_LAYOUT_READ, "RECALL_ANY_LAYOUT_READ" }, \ - { NFS4CLNT_RECALL_ANY_LAYOUT_RW, "RECALL_ANY_LAYOUT_RW" }) + { BIT(NFS4CLNT_MANAGER_RUNNING), "MANAGER_RUNNING" }, \ + { BIT(NFS4CLNT_CHECK_LEASE), "CHECK_LEASE" }, \ + { BIT(NFS4CLNT_LEASE_EXPIRED), "LEASE_EXPIRED" }, \ + { BIT(NFS4CLNT_RECLAIM_REBOOT), "RECLAIM_REBOOT" }, \ + { BIT(NFS4CLNT_RECLAIM_NOGRACE), "RECLAIM_NOGRACE" }, \ + { BIT(NFS4CLNT_DELEGRETURN), "DELEGRETURN" }, \ + { BIT(NFS4CLNT_SESSION_RESET), "SESSION_RESET" }, \ + { BIT(NFS4CLNT_LEASE_CONFIRM), "LEASE_CONFIRM" }, \ + { BIT(NFS4CLNT_SERVER_SCOPE_MISMATCH), "SERVER_SCOPE_MISMATCH" }, \ + { BIT(NFS4CLNT_PURGE_STATE), "PURGE_STATE" }, \ + { BIT(NFS4CLNT_BIND_CONN_TO_SESSION), "BIND_CONN_TO_SESSION" }, \ + { BIT(NFS4CLNT_MOVED), "MOVED" }, \ + { BIT(NFS4CLNT_LEASE_MOVED), "LEASE_MOVED" }, \ + { BIT(NFS4CLNT_DELEGATION_EXPIRED), "DELEGATION_EXPIRED" }, \ + { BIT(NFS4CLNT_RUN_MANAGER), "RUN_MANAGER" }, \ + { BIT(NFS4CLNT_MANAGER_AVAILABLE), "MANAGER_AVAILABLE" }, \ + { BIT(NFS4CLNT_RECALL_RUNNING), "RECALL_RUNNING" }, \ + { BIT(NFS4CLNT_RECALL_ANY_LAYOUT_READ), "RECALL_ANY_LAYOUT_READ" }, \ + { BIT(NFS4CLNT_RECALL_ANY_LAYOUT_RW), "RECALL_ANY_LAYOUT_RW" }, \ + { BIT(NFS4CLNT_DELEGRETURN_DELAYED), "DELEGRETURN_DELAYED" }) TRACE_EVENT(nfs4_state_mgr, TP_PROTO( diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 642f6921852f..a778713343df 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -152,8 +152,6 @@ DEFINE_NFS_INODE_EVENT(nfs_getattr_enter); DEFINE_NFS_INODE_EVENT_DONE(nfs_getattr_exit); DEFINE_NFS_INODE_EVENT(nfs_setattr_enter); DEFINE_NFS_INODE_EVENT_DONE(nfs_setattr_exit); -DEFINE_NFS_INODE_EVENT(nfs_writeback_page_enter); -DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_page_exit); DEFINE_NFS_INODE_EVENT(nfs_writeback_inode_enter); DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_inode_exit);
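/*
 * Editor's sketch, not part of the patch: why show_nfs4_clp_state() above now
 * wraps each flag in BIT(). cl_state is manipulated with set_bit() and
 * friends, so the NFS4CLNT_* constants are bit numbers, while __print_flags()
 * compares bit masks. Assuming NFS4CLNT_CHECK_LEASE is bit 1:
 *
 *	set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);	cl_state becomes 0x2
 *	{ NFS4CLNT_CHECK_LEASE, "CHECK_LEASE" }		old entry, matched 0x1
 *	{ BIT(NFS4CLNT_CHECK_LEASE), "CHECK_LEASE" }	new entry, matches 0x2
 */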
DEFINE_NFS_INODE_EVENT(nfs_fsync_enter); @@ -933,13 +931,13 @@ TRACE_EVENT(nfs_sillyrename_unlink, ) ); -TRACE_EVENT(nfs_aop_readpage, +DECLARE_EVENT_CLASS(nfs_folio_event, TP_PROTO( const struct inode *inode, - struct page *page + struct folio *folio ), - TP_ARGS(inode, page), + TP_ARGS(inode, folio), TP_STRUCT__entry( __field(dev_t, dev) @@ -947,6 +945,7 @@ TRACE_EVENT(nfs_aop_readpage, __field(u64, fileid) __field(u64, version) __field(loff_t, offset) + __field(u32, count) ), TP_fast_assign( @@ -956,26 +955,36 @@ TRACE_EVENT(nfs_aop_readpage, __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(inode); - __entry->offset = page_index(page) << PAGE_SHIFT; + __entry->offset = folio_file_pos(folio); + __entry->count = nfs_folio_length(folio); ), TP_printk( - "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu offset=%lld", + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu " + "offset=%lld count=%u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, __entry->version, - __entry->offset + __entry->offset, __entry->count ) ); -TRACE_EVENT(nfs_aop_readpage_done, +#define DEFINE_NFS_FOLIO_EVENT(name) \ + DEFINE_EVENT(nfs_folio_event, name, \ + TP_PROTO( \ + const struct inode *inode, \ + struct folio *folio \ + ), \ + TP_ARGS(inode, folio)) + +DECLARE_EVENT_CLASS(nfs_folio_event_done, TP_PROTO( const struct inode *inode, - struct page *page, + struct folio *folio, int ret ), - TP_ARGS(inode, page, ret), + TP_ARGS(inode, folio, ret), TP_STRUCT__entry( __field(dev_t, dev) @@ -984,6 +993,7 @@ TRACE_EVENT(nfs_aop_readpage_done, __field(u64, fileid) __field(u64, version) __field(loff_t, offset) + __field(u32, count) ), TP_fast_assign( @@ -993,19 +1003,39 @@ TRACE_EVENT(nfs_aop_readpage_done, __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(inode); - __entry->offset = page_index(page) << PAGE_SHIFT; + __entry->offset = folio_file_pos(folio); + __entry->count = nfs_folio_length(folio); __entry->ret = ret; ), TP_printk( - "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu offset=%lld ret=%d", + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu " + "offset=%lld count=%u ret=%d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, __entry->version, - __entry->offset, __entry->ret + __entry->offset, __entry->count, __entry->ret ) ); +#define DEFINE_NFS_FOLIO_EVENT_DONE(name) \ + DEFINE_EVENT(nfs_folio_event_done, name, \ + TP_PROTO( \ + const struct inode *inode, \ + struct folio *folio, \ + int ret \ + ), \ + TP_ARGS(inode, folio, ret)) + +DEFINE_NFS_FOLIO_EVENT(nfs_aop_readpage); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_aop_readpage_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_writeback_folio); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writeback_folio_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_invalidate_folio); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_launder_folio_done); + TRACE_EVENT(nfs_aop_readahead, TP_PROTO( const struct inode *inode, diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 16be6dae524f..64fa8de199de 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -21,6 +21,7 @@ #include <linux/nfs_page.h> #include <linux/nfs_mount.h> #include <linux/export.h> +#include <linux/filelock.h> #include "internal.h" #include "pnfs.h" @@ -31,6 +32,42 @@ static struct kmem_cache *nfs_page_cachep; static const struct rpc_call_ops nfs_pgio_common_ops; +struct nfs_page_iter_page { + const struct 
nfs_page *req; + size_t count; +}; + +static void nfs_page_iter_page_init(struct nfs_page_iter_page *i, + const struct nfs_page *req) +{ + i->req = req; + i->count = 0; +} + +static void nfs_page_iter_page_advance(struct nfs_page_iter_page *i, size_t sz) +{ + const struct nfs_page *req = i->req; + size_t tmp = i->count + sz; + + i->count = (tmp < req->wb_bytes) ? tmp : req->wb_bytes; +} + +static struct page *nfs_page_iter_page_get(struct nfs_page_iter_page *i) +{ + const struct nfs_page *req = i->req; + struct page *page; + + if (i->count != req->wb_bytes) { + size_t base = i->count + req->wb_pgbase; + size_t len = PAGE_SIZE - offset_in_page(base); + + page = nfs_page_to_page(req, base); + nfs_page_iter_page_advance(i, len); + return page; + } + return NULL; +} + static struct nfs_pgio_mirror * nfs_pgio_get_mirror(struct nfs_pageio_descriptor *desc, u32 idx) { @@ -390,7 +427,7 @@ nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev) * has extra ref from the write/commit path to handle handoff * between write and commit lists. */ if (test_bit(PG_INODE_REF, &prev->wb_head->wb_flags)) { - inode = page_file_mapping(req->wb_page)->host; + inode = nfs_page_to_inode(req); set_bit(PG_INODE_REF, &req->wb_flags); kref_get(&req->wb_kref); atomic_long_inc(&NFS_I(inode)->nrequests); @@ -430,10 +467,9 @@ out: nfs_release_request(head); } -static struct nfs_page * -__nfs_create_request(struct nfs_lock_context *l_ctx, struct page *page, - unsigned int pgbase, unsigned int offset, - unsigned int count) +static struct nfs_page *nfs_page_create(struct nfs_lock_context *l_ctx, + unsigned int pgbase, pgoff_t index, + unsigned int offset, unsigned int count) { struct nfs_page *req; struct nfs_open_context *ctx = l_ctx->open_context; @@ -452,42 +488,90 @@ __nfs_create_request(struct nfs_lock_context *l_ctx, struct page *page, /* Initialize the request struct. Initially, we assume a * long write-back delay. This will be adjusted in * update_nfs_request below if the region is not locked. */ - req->wb_page = page; - if (page) { - req->wb_index = page_index(page); - get_page(page); - } - req->wb_offset = offset; - req->wb_pgbase = pgbase; - req->wb_bytes = count; + req->wb_pgbase = pgbase; + req->wb_index = index; + req->wb_offset = offset; + req->wb_bytes = count; kref_init(&req->wb_kref); req->wb_nio = 0; return req; } +static void nfs_page_assign_folio(struct nfs_page *req, struct folio *folio) +{ + if (folio != NULL) { + req->wb_folio = folio; + folio_get(folio); + set_bit(PG_FOLIO, &req->wb_flags); + } +} + +static void nfs_page_assign_page(struct nfs_page *req, struct page *page) +{ + if (page != NULL) { + req->wb_page = page; + get_page(page); + } +} + /** - * nfs_create_request - Create an NFS read/write request. + * nfs_page_create_from_page - Create an NFS read/write request. * @ctx: open context to use * @page: page to write - * @offset: starting offset within the page for the write + * @pgbase: starting offset within the page for the write + * @offset: file offset for the write * @count: number of bytes to read/write * * The page must be locked by the caller. This makes sure we never * create two different requests for the same page. * User should ensure it is safe to sleep in this function. 
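 *
 * Editor's note, illustration only: the direct I/O code earlier in this diff
 * now calls this as, for example,
 *
 *	req = nfs_page_create_from_page(dreq->ctx, pagevec[i],
 *					pgbase, pos, req_len);
 *
 * passing the file position @pos up front, so wb_index and wb_offset no
 * longer need to be patched by hand after the request is created.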
*/ -struct nfs_page * -nfs_create_request(struct nfs_open_context *ctx, struct page *page, - unsigned int offset, unsigned int count) +struct nfs_page *nfs_page_create_from_page(struct nfs_open_context *ctx, + struct page *page, + unsigned int pgbase, loff_t offset, + unsigned int count) +{ + struct nfs_lock_context *l_ctx = nfs_get_lock_context(ctx); + struct nfs_page *ret; + + if (IS_ERR(l_ctx)) + return ERR_CAST(l_ctx); + ret = nfs_page_create(l_ctx, pgbase, offset >> PAGE_SHIFT, + offset_in_page(offset), count); + if (!IS_ERR(ret)) { + nfs_page_assign_page(ret, page); + nfs_page_group_init(ret, NULL); + } + nfs_put_lock_context(l_ctx); + return ret; +} + +/** + * nfs_page_create_from_folio - Create an NFS read/write request. + * @ctx: open context to use + * @folio: folio to write + * @offset: starting offset within the folio for the write + * @count: number of bytes to read/write + * + * The page must be locked by the caller. This makes sure we never + * create two different requests for the same page. + * User should ensure it is safe to sleep in this function. + */ +struct nfs_page *nfs_page_create_from_folio(struct nfs_open_context *ctx, + struct folio *folio, + unsigned int offset, + unsigned int count) { struct nfs_lock_context *l_ctx = nfs_get_lock_context(ctx); struct nfs_page *ret; if (IS_ERR(l_ctx)) return ERR_CAST(l_ctx); - ret = __nfs_create_request(l_ctx, page, offset, offset, count); - if (!IS_ERR(ret)) + ret = nfs_page_create(l_ctx, offset, folio_index(folio), offset, count); + if (!IS_ERR(ret)) { + nfs_page_assign_folio(ret, folio); nfs_page_group_init(ret, NULL); + } nfs_put_lock_context(l_ctx); return ret; } @@ -500,10 +584,16 @@ nfs_create_subreq(struct nfs_page *req, { struct nfs_page *last; struct nfs_page *ret; + struct folio *folio = nfs_page_to_folio(req); + struct page *page = nfs_page_to_page(req, pgbase); - ret = __nfs_create_request(req->wb_lock_context, req->wb_page, - pgbase, offset, count); + ret = nfs_page_create(req->wb_lock_context, pgbase, req->wb_index, + offset, count); if (!IS_ERR(ret)) { + if (folio) + nfs_page_assign_folio(ret, folio); + else + nfs_page_assign_page(ret, page); /* find the last request */ for (last = req->wb_head; last->wb_this_page != req->wb_head; @@ -511,7 +601,6 @@ nfs_create_subreq(struct nfs_page *req, ; nfs_lock_request(ret); - ret->wb_index = req->wb_index; nfs_page_group_init(ret, last); ret->wb_nio = req->wb_nio; } @@ -550,11 +639,16 @@ void nfs_unlock_and_release_request(struct nfs_page *req) */ static void nfs_clear_request(struct nfs_page *req) { + struct folio *folio = nfs_page_to_folio(req); struct page *page = req->wb_page; struct nfs_lock_context *l_ctx = req->wb_lock_context; struct nfs_open_context *ctx; - if (page != NULL) { + if (folio != NULL) { + folio_put(folio); + req->wb_folio = NULL; + clear_bit(PG_FOLIO, &req->wb_flags); + } else if (page != NULL) { put_page(page); req->wb_page = NULL; } @@ -692,13 +786,14 @@ EXPORT_SYMBOL_GPL(nfs_pgio_header_free); /** * nfs_pgio_rpcsetup - Set up arguments for a pageio call * @hdr: The pageio hdr + * @pgbase: base * @count: Number of bytes to read * @how: How to commit data (writes only) * @cinfo: Commit information for the call (writes only) */ -static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, - unsigned int count, - int how, struct nfs_commit_info *cinfo) +static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, unsigned int pgbase, + unsigned int count, int how, + struct nfs_commit_info *cinfo) { struct nfs_page *req = hdr->req; @@ -709,7 +804,7 @@ 
static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, hdr->args.offset = req_offset(req); /* pnfs_set_layoutcommit needs this */ hdr->mds_offset = hdr->args.offset; - hdr->args.pgbase = req->wb_pgbase; + hdr->args.pgbase = pgbase; hdr->args.pages = hdr->page_array.pagevec; hdr->args.count = count; hdr->args.context = get_nfs_open_context(nfs_req_openctx(req)); @@ -895,9 +990,10 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, struct nfs_commit_info cinfo; struct nfs_page_array *pg_array = &hdr->page_array; unsigned int pagecount, pageused; + unsigned int pg_base = offset_in_page(mirror->pg_base); gfp_t gfp_flags = nfs_io_gfp_mask(); - pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count); + pagecount = nfs_page_array_len(pg_base, mirror->pg_count); pg_array->npages = pagecount; if (pagecount <= ARRAY_SIZE(pg_array->page_array)) @@ -917,16 +1013,26 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, last_page = NULL; pageused = 0; while (!list_empty(head)) { + struct nfs_page_iter_page i; + struct page *page; + req = nfs_list_entry(head->next); nfs_list_move_request(req, &hdr->pages); - if (!last_page || last_page != req->wb_page) { - pageused++; - if (pageused > pagecount) - break; - *pages++ = last_page = req->wb_page; + if (req->wb_pgbase == 0) + last_page = NULL; + + nfs_page_iter_page_init(&i, req); + while ((page = nfs_page_iter_page_get(&i)) != NULL) { + if (last_page != page) { + pageused++; + if (pageused > pagecount) + goto full; + *pages++ = last_page = page; + } } } +full: if (WARN_ON_ONCE(pageused != pagecount)) { nfs_pgio_error(hdr); desc->pg_error = -EINVAL; @@ -938,7 +1044,8 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, desc->pg_ioflags &= ~FLUSH_COND_STABLE; /* Set up the argument struct */ - nfs_pgio_rpcsetup(hdr, mirror->pg_count, desc->pg_ioflags, &cinfo); + nfs_pgio_rpcsetup(hdr, pg_base, mirror->pg_count, desc->pg_ioflags, + &cinfo); desc->pg_rpc_callops = &nfs_pgio_common_ops; return 0; } @@ -1034,6 +1141,24 @@ static bool nfs_match_lock_context(const struct nfs_lock_context *l1, return l1->lockowner == l2->lockowner; } +static bool nfs_page_is_contiguous(const struct nfs_page *prev, + const struct nfs_page *req) +{ + size_t prev_end = prev->wb_pgbase + prev->wb_bytes; + + if (req_offset(req) != req_offset(prev) + prev->wb_bytes) + return false; + if (req->wb_pgbase == 0) + return prev_end == nfs_page_max_length(prev); + if (req->wb_pgbase == prev_end) { + struct folio *folio = nfs_page_to_folio(req); + if (folio) + return folio == nfs_page_to_folio(prev); + return req->wb_page == prev->wb_page; + } + return false; +} + /** * nfs_coalesce_size - test two requests for compatibility * @prev: pointer to nfs_page @@ -1062,16 +1187,8 @@ static unsigned int nfs_coalesce_size(struct nfs_page *prev, !nfs_match_lock_context(req->wb_lock_context, prev->wb_lock_context)) return 0; - if (req_offset(req) != req_offset(prev) + prev->wb_bytes) + if (!nfs_page_is_contiguous(prev, req)) return 0; - if (req->wb_page == prev->wb_page) { - if (req->wb_pgbase != prev->wb_pgbase + prev->wb_bytes) - return 0; - } else { - if (req->wb_pgbase != 0 || - prev->wb_pgbase + prev->wb_bytes != PAGE_SIZE) - return 0; - } } return pgio->pg_ops->pg_test(pgio, prev, req); } @@ -1411,16 +1528,21 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) { struct nfs_pgio_mirror *mirror; struct nfs_page *prev; + struct folio *folio; u32 midx; for (midx = 0; midx < desc->pg_mirror_count; midx++) { mirror = nfs_pgio_get_mirror(desc, 
midx); if (!list_empty(&mirror->pg_list)) { prev = nfs_list_entry(mirror->pg_list.prev); - if (index != prev->wb_index + 1) { - nfs_pageio_complete(desc); - break; - } + folio = nfs_page_to_folio(prev); + if (folio) { + if (index == folio_next_index(folio)) + continue; + } else if (index == prev->wb_index + 1) + continue; + nfs_pageio_complete(desc); + break; } } } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index a5db5158c634..306cba0b9e69 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -511,7 +511,7 @@ pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode) spin_lock(&inode->i_lock); pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); - pnfs_mark_matching_lsegs_invalid(lo, &head, &range, 0); + pnfs_mark_matching_lsegs_return(lo, &head, &range, 0); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__, diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index e3e6a41f19de..d886c8226d8f 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -193,7 +193,7 @@ struct pnfs_commit_ops { void (*recover_commit_reqs) (struct list_head *list, struct nfs_commit_info *cinfo); struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo, - struct page *page); + struct folio *folio); }; struct pnfs_layout_hdr { @@ -395,7 +395,7 @@ void pnfs_generic_rw_release(void *data); void pnfs_generic_recover_commit_reqs(struct list_head *dst, struct nfs_commit_info *cinfo); struct nfs_page *pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, - struct page *page); + struct folio *folio); int pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, int how, @@ -557,13 +557,13 @@ pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo) static inline struct nfs_page * pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, - struct page *page) + struct folio *folio) { struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; if (!fl_cinfo->ops || !fl_cinfo->ops->search_commit_reqs) return NULL; - return fl_cinfo->ops->search_commit_reqs(cinfo, page); + return fl_cinfo->ops->search_commit_reqs(cinfo, folio); } /* Should the pNFS client commit and return the layout upon a setattr */ @@ -864,7 +864,7 @@ pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo) static inline struct nfs_page * pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, - struct page *page) + struct folio *folio) { return NULL; } diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 5d035dd2d7bf..a0112ad4937a 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -353,7 +353,7 @@ EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs); static struct nfs_page * pnfs_bucket_search_commit_reqs(struct pnfs_commit_bucket *buckets, - unsigned int nbuckets, struct page *page) + unsigned int nbuckets, struct folio *folio) { struct nfs_page *req; struct pnfs_commit_bucket *b; @@ -363,11 +363,11 @@ pnfs_bucket_search_commit_reqs(struct pnfs_commit_bucket *buckets, * request is found */ for (i = 0, b = buckets; i < nbuckets; i++, b++) { list_for_each_entry(req, &b->written, wb_list) { - if (req->wb_page == page) + if (nfs_page_to_folio(req) == folio) return req->wb_head; } list_for_each_entry(req, &b->committing, wb_list) { - if (req->wb_page == page) + if (nfs_page_to_folio(req) == folio) return req->wb_head; } } @@ -375,14 +375,14 @@ pnfs_bucket_search_commit_reqs(struct pnfs_commit_bucket *buckets, } /* pnfs_generic_search_commit_reqs - Search lists in @cinfo for 
the head request - * for @page + * for @folio * @cinfo - commit info for current inode - * @page - page to search for matching head request + * @folio - page to search for matching head request * * Return: the head request if one is found, otherwise %NULL. */ -struct nfs_page * -pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page) +struct nfs_page *pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, + struct folio *folio) { struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; struct pnfs_commit_array *array; @@ -390,7 +390,7 @@ pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page list_for_each_entry(array, &fl_cinfo->commits, cinfo_list) { req = pnfs_bucket_search_commit_reqs(array->buckets, - array->nbuckets, page); + array->nbuckets, folio); if (req) return req; } @@ -1180,7 +1180,7 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, nfs_request_add_commit_list_locked(req, list, cinfo); mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); - nfs_mark_page_unstable(req->wb_page, cinfo); + nfs_folio_mark_unstable(nfs_page_to_folio(req), cinfo); return; out_resched: mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 8ae2c8d1219d..c380cff4108e 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -49,12 +49,11 @@ static void nfs_readhdr_free(struct nfs_pgio_header *rhdr) kmem_cache_free(nfs_rdata_cachep, rhdr); } -static -int nfs_return_empty_page(struct page *page) +static int nfs_return_empty_folio(struct folio *folio) { - zero_user(page, 0, PAGE_SIZE); - SetPageUptodate(page); - unlock_page(page); + folio_zero_segment(folio, 0, folio_size(folio)); + folio_mark_uptodate(folio); + folio_unlock(folio); return 0; } @@ -111,18 +110,18 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); static void nfs_readpage_release(struct nfs_page *req, int error) { struct inode *inode = d_inode(nfs_req_openctx(req)->dentry); - struct page *page = req->wb_page; + struct folio *folio = nfs_page_to_folio(req); dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), req->wb_bytes, (long long)req_offset(req)); if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT) - SetPageError(page); + folio_set_error(folio); if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) { - if (PageUptodate(page)) - nfs_fscache_write_page(inode, page); - unlock_page(page); + if (folio_test_uptodate(folio)) + nfs_fscache_write_page(inode, &folio->page); + folio_unlock(folio); } nfs_release_request(req); } @@ -135,7 +134,7 @@ struct nfs_readdesc { static void nfs_page_group_set_uptodate(struct nfs_page *req) { if (nfs_page_group_sync_on_bit(req, PG_UPTODATE)) - SetPageUptodate(req->wb_page); + folio_mark_uptodate(nfs_page_to_folio(req)); } static void nfs_read_completion(struct nfs_pgio_header *hdr) @@ -147,7 +146,7 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr) goto out; while (!list_empty(&hdr->pages)) { struct nfs_page *req = nfs_list_entry(hdr->pages.next); - struct page *page = req->wb_page; + struct folio *folio = nfs_page_to_folio(req); unsigned long start = req->wb_pgbase; unsigned long end = req->wb_pgbase + req->wb_bytes; @@ -157,14 +156,14 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr) if (bytes > hdr->good_bytes) { /* nothing in this request was good, so zero * the full extent of the request */ - zero_user_segment(page, start, end); + folio_zero_segment(folio, start, end); } else if (hdr->good_bytes - bytes < req->wb_bytes) { /* 
part of this request has good bytes, but * not all. zero the bad bytes */ start += hdr->good_bytes - bytes; WARN_ON(start < req->wb_pgbase); - zero_user_segment(page, start, end); + folio_zero_segment(folio, start, end); } } error = 0; @@ -281,33 +280,34 @@ static void nfs_readpage_result(struct rpc_task *task, nfs_readpage_retry(task, hdr); } -static int -readpage_async_filler(struct nfs_readdesc *desc, struct page *page) +static int readpage_async_filler(struct nfs_readdesc *desc, struct folio *folio) { - struct inode *inode = page_file_mapping(page)->host; - unsigned int rsize = NFS_SERVER(inode)->rsize; + struct inode *inode = folio_file_mapping(folio)->host; + struct nfs_server *server = NFS_SERVER(inode); + size_t fsize = folio_size(folio); + unsigned int rsize = server->rsize; struct nfs_page *new; unsigned int len, aligned_len; int error; - len = nfs_page_length(page); + len = nfs_folio_length(folio); if (len == 0) - return nfs_return_empty_page(page); + return nfs_return_empty_folio(folio); - aligned_len = min_t(unsigned int, ALIGN(len, rsize), PAGE_SIZE); + aligned_len = min_t(unsigned int, ALIGN(len, rsize), fsize); - if (!IS_SYNC(page->mapping->host)) { - error = nfs_fscache_read_page(page->mapping->host, page); + if (!IS_SYNC(inode)) { + error = nfs_fscache_read_page(inode, &folio->page); if (error == 0) goto out_unlock; } - new = nfs_create_request(desc->ctx, page, 0, aligned_len); + new = nfs_page_create_from_folio(desc->ctx, folio, 0, aligned_len); if (IS_ERR(new)) goto out_error; - if (len < PAGE_SIZE) - zero_user_segment(page, len, PAGE_SIZE); + if (len < fsize) + folio_zero_segment(folio, len, fsize); if (!nfs_pageio_add_request(&desc->pgio, new)) { nfs_list_remove_request(new); error = desc->pgio.pg_error; @@ -318,7 +318,7 @@ readpage_async_filler(struct nfs_readdesc *desc, struct page *page) out_error: error = PTR_ERR(new); out_unlock: - unlock_page(page); + folio_unlock(folio); out: return error; } @@ -331,61 +331,54 @@ out: */ int nfs_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; struct nfs_readdesc desc; - struct inode *inode = page_file_mapping(page)->host; + struct inode *inode = file_inode(file); int ret; - trace_nfs_aop_readpage(inode, page); + trace_nfs_aop_readpage(inode, folio); nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); /* * Try to flush any pending writes to the file.. * - * NOTE! Because we own the page lock, there cannot + * NOTE! Because we own the folio lock, there cannot * be any new pending writes generated at this point - * for this page (other pages can be written to). + * for this folio (other folios can be written to). */ - ret = nfs_wb_page(inode, page); + ret = nfs_wb_folio(inode, folio); if (ret) goto out_unlock; - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) goto out_unlock; ret = -ESTALE; if (NFS_STALE(inode)) goto out_unlock; - if (file == NULL) { - ret = -EBADF; - desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ); - if (desc.ctx == NULL) - goto out_unlock; - } else - desc.ctx = get_nfs_open_context(nfs_file_open_context(file)); + desc.ctx = get_nfs_open_context(nfs_file_open_context(file)); xchg(&desc.ctx->error, 0); nfs_pageio_init_read(&desc.pgio, inode, false, &nfs_async_read_completion_ops); - ret = readpage_async_filler(&desc, page); + ret = readpage_async_filler(&desc, folio); if (ret) goto out; nfs_pageio_complete_read(&desc.pgio); ret = desc.pgio.pg_error < 0 ? 
desc.pgio.pg_error : 0; if (!ret) { - ret = wait_on_page_locked_killable(page); - if (!PageUptodate(page) && !ret) + ret = folio_wait_locked_killable(folio); + if (!folio_test_uptodate(folio) && !ret) ret = xchg(&desc.ctx->error, 0); } out: put_nfs_open_context(desc.ctx); - trace_nfs_aop_readpage_done(inode, page, ret); + trace_nfs_aop_readpage_done(inode, folio, ret); return ret; out_unlock: - unlock_page(page); - trace_nfs_aop_readpage_done(inode, page, ret); + folio_unlock(folio); + trace_nfs_aop_readpage_done(inode, folio, ret); return ret; } @@ -395,7 +388,7 @@ void nfs_readahead(struct readahead_control *ractl) struct file *file = ractl->file; struct nfs_readdesc desc; struct inode *inode = ractl->mapping->host; - struct page *page; + struct folio *folio; int ret; trace_nfs_aop_readahead(inode, readahead_pos(ractl), nr_pages); @@ -416,9 +409,8 @@ void nfs_readahead(struct readahead_control *ractl) nfs_pageio_init_read(&desc.pgio, inode, false, &nfs_async_read_completion_ops); - while ((page = readahead_page(ractl)) != NULL) { - ret = readpage_async_filler(&desc, page); - put_page(page); + while ((folio = readahead_folio(ractl)) != NULL) { + ret = readpage_async_filler(&desc, folio); if (ret) break; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 80c240e50952..f4cca8f00c0c 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -25,6 +25,7 @@ #include <linux/freezer.h> #include <linux/wait.h> #include <linux/iversion.h> +#include <linux/filelock.h> #include <linux/uaccess.h> #include <linux/sched/mm.h> @@ -63,7 +64,7 @@ static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, struct inode *inode); static struct nfs_page * nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, - struct page *page); + struct folio *folio); static struct kmem_cache *nfs_wdata_cachep; static mempool_t *nfs_wdata_mempool; @@ -170,31 +171,28 @@ nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode) return 0; } -static struct nfs_page * -nfs_page_private_request(struct page *page) +static struct nfs_page *nfs_folio_private_request(struct folio *folio) { - if (!PagePrivate(page)) - return NULL; - return (struct nfs_page *)page_private(page); + return folio_get_private(folio); } -/* - * nfs_page_find_head_request_locked - find head request associated with @page +/** + * nfs_folio_find_private_request - find head request associated with a folio + * @folio: pointer to folio * * must be called while holding the inode lock. * * returns matching head request with reference held, or NULL if not found. 
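 *
 * Editor's note, illustration only: the reference handed back is the wb_kref
 * taken with kref_get() in the body below; a caller finished with the request
 * drops it via nfs_release_request(), or nfs_unlock_and_release_request()
 * when it also holds the request locked.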
*/ -static struct nfs_page * -nfs_page_find_private_request(struct page *page) +static struct nfs_page *nfs_folio_find_private_request(struct folio *folio) { - struct address_space *mapping = page_file_mapping(page); + struct address_space *mapping = folio_file_mapping(folio); struct nfs_page *req; - if (!PagePrivate(page)) + if (!folio_test_private(folio)) return NULL; spin_lock(&mapping->private_lock); - req = nfs_page_private_request(page); + req = nfs_folio_private_request(folio); if (req) { WARN_ON_ONCE(req->wb_head != req); kref_get(&req->wb_kref); @@ -203,18 +201,17 @@ nfs_page_find_private_request(struct page *page) return req; } -static struct nfs_page * -nfs_page_find_swap_request(struct page *page) +static struct nfs_page *nfs_folio_find_swap_request(struct folio *folio) { - struct inode *inode = page_file_mapping(page)->host; + struct inode *inode = folio_file_mapping(folio)->host; struct nfs_inode *nfsi = NFS_I(inode); struct nfs_page *req = NULL; - if (!PageSwapCache(page)) + if (!folio_test_swapcache(folio)) return NULL; mutex_lock(&nfsi->commit_mutex); - if (PageSwapCache(page)) { + if (folio_test_swapcache(folio)) { req = nfs_page_search_commits_for_head_request_locked(nfsi, - page); + folio); if (req) { WARN_ON_ONCE(req->wb_head != req); kref_get(&req->wb_kref); @@ -224,29 +221,30 @@ nfs_page_find_swap_request(struct page *page) return req; } -/* - * nfs_page_find_head_request - find head request associated with @page +/** + * nfs_folio_find_head_request - find head request associated with a folio + * @folio: pointer to folio * * returns matching head request with reference held, or NULL if not found. */ -static struct nfs_page *nfs_page_find_head_request(struct page *page) +static struct nfs_page *nfs_folio_find_head_request(struct folio *folio) { struct nfs_page *req; - req = nfs_page_find_private_request(page); + req = nfs_folio_find_private_request(folio); if (!req) - req = nfs_page_find_swap_request(page); + req = nfs_folio_find_swap_request(folio); return req; } -static struct nfs_page *nfs_find_and_lock_page_request(struct page *page) +static struct nfs_page *nfs_folio_find_and_lock_request(struct folio *folio) { - struct inode *inode = page_file_mapping(page)->host; + struct inode *inode = folio_file_mapping(folio)->host; struct nfs_page *req, *head; int ret; for (;;) { - req = nfs_page_find_head_request(page); + req = nfs_folio_find_head_request(folio); if (!req) return req; head = nfs_page_group_lock_head(req); @@ -260,9 +258,9 @@ static struct nfs_page *nfs_find_and_lock_page_request(struct page *page) return ERR_PTR(ret); } /* Ensure that nobody removed the request before we locked it */ - if (head == nfs_page_private_request(page)) + if (head == nfs_folio_private_request(folio)) break; - if (PageSwapCache(page)) + if (folio_test_swapcache(folio)) break; nfs_unlock_and_release_request(head); } @@ -270,18 +268,19 @@ static struct nfs_page *nfs_find_and_lock_page_request(struct page *page) } /* Adjust the file length if we're writing beyond the end */ -static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) +static void nfs_grow_file(struct folio *folio, unsigned int offset, + unsigned int count) { - struct inode *inode = page_file_mapping(page)->host; + struct inode *inode = folio_file_mapping(folio)->host; loff_t end, i_size; pgoff_t end_index; spin_lock(&inode->i_lock); i_size = i_size_read(inode); - end_index = (i_size - 1) >> PAGE_SHIFT; - if (i_size > 0 && page_index(page) < end_index) + end_index = ((i_size - 1) >> 
folio_shift(folio)) << folio_order(folio); + if (i_size > 0 && folio_index(folio) < end_index) goto out; - end = page_file_offset(page) + ((loff_t)offset+count); + end = folio_file_pos(folio) + (loff_t)offset + (loff_t)count; if (i_size >= end) goto out; trace_nfs_size_grow(inode, end); @@ -307,11 +306,11 @@ static void nfs_set_pageerror(struct address_space *mapping) spin_unlock(&inode->i_lock); } -static void nfs_mapping_set_error(struct page *page, int error) +static void nfs_mapping_set_error(struct folio *folio, int error) { - struct address_space *mapping = page_file_mapping(page); + struct address_space *mapping = folio_file_mapping(folio); - SetPageError(page); + folio_set_error(folio); filemap_set_wb_err(mapping, error); if (mapping->host) errseq_set(&mapping->host->i_sb->s_wb_err, @@ -358,9 +357,9 @@ nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset) */ static bool nfs_page_group_covers_page(struct nfs_page *req) { + unsigned int len = nfs_folio_length(nfs_page_to_folio(req)); struct nfs_page *tmp; unsigned int pos = 0; - unsigned int len = nfs_page_length(req->wb_page); nfs_page_group_lock(req); @@ -380,11 +379,13 @@ static bool nfs_page_group_covers_page(struct nfs_page *req) */ static void nfs_mark_uptodate(struct nfs_page *req) { - if (PageUptodate(req->wb_page)) + struct folio *folio = nfs_page_to_folio(req); + + if (folio_test_uptodate(folio)) return; if (!nfs_page_group_covers_page(req)) return; - SetPageUptodate(req->wb_page); + folio_mark_uptodate(folio); } static int wb_priority(struct writeback_control *wbc) @@ -406,35 +407,34 @@ int nfs_congestion_kb; #define NFS_CONGESTION_OFF_THRESH \ (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2)) -static void nfs_set_page_writeback(struct page *page) +static void nfs_folio_set_writeback(struct folio *folio) { - struct inode *inode = page_file_mapping(page)->host; - struct nfs_server *nfss = NFS_SERVER(inode); - int ret = test_set_page_writeback(page); - - WARN_ON_ONCE(ret != 0); + struct nfs_server *nfss = NFS_SERVER(folio_file_mapping(folio)->host); - if (atomic_long_inc_return(&nfss->writeback) > - NFS_CONGESTION_ON_THRESH) + folio_start_writeback(folio); + if (atomic_long_inc_return(&nfss->writeback) > NFS_CONGESTION_ON_THRESH) nfss->write_congested = 1; } -static void nfs_end_page_writeback(struct nfs_page *req) +static void nfs_folio_end_writeback(struct folio *folio) { - struct inode *inode = page_file_mapping(req->wb_page)->host; - struct nfs_server *nfss = NFS_SERVER(inode); - bool is_done; + struct nfs_server *nfss = NFS_SERVER(folio_file_mapping(folio)->host); - is_done = nfs_page_group_sync_on_bit(req, PG_WB_END); - nfs_unlock_request(req); - if (!is_done) - return; - - end_page_writeback(req->wb_page); - if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) + folio_end_writeback(folio); + if (atomic_long_dec_return(&nfss->writeback) < + NFS_CONGESTION_OFF_THRESH) nfss->write_congested = 0; } +static void nfs_page_end_writeback(struct nfs_page *req) +{ + if (nfs_page_group_sync_on_bit(req, PG_WB_END)) { + nfs_unlock_request(req); + nfs_folio_end_writeback(nfs_page_to_folio(req)); + } else + nfs_unlock_request(req); +} + /* * nfs_destroy_unlinked_subrequests - destroy recently unlinked subrequests * @@ -549,7 +549,7 @@ nfs_join_page_group(struct nfs_page *head, struct inode *inode) /* * nfs_lock_and_join_requests - join all subreqs to the head req - * @page: the page used to lookup the "page group" of nfs_page structures + * @folio: the folio used to lookup the 
"page group" of nfs_page structures * * This function joins all sub requests to the head request by first * locking all requests in the group, cancelling any pending operations @@ -559,13 +559,12 @@ nfs_join_page_group(struct nfs_page *head, struct inode *inode) * * Returns a locked, referenced pointer to the head request - which after * this call is guaranteed to be the only request associated with the page. - * Returns NULL if no requests are found for @page, or a ERR_PTR if an + * Returns NULL if no requests are found for @folio, or a ERR_PTR if an * error was encountered. */ -static struct nfs_page * -nfs_lock_and_join_requests(struct page *page) +static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio) { - struct inode *inode = page_file_mapping(page)->host; + struct inode *inode = folio_file_mapping(folio)->host; struct nfs_page *head; int ret; @@ -574,7 +573,7 @@ nfs_lock_and_join_requests(struct page *page) * reference to the whole page group - the group will not be destroyed * until the head reference is released. */ - head = nfs_find_and_lock_page_request(page); + head = nfs_folio_find_and_lock_request(folio); if (IS_ERR_OR_NULL(head)) return head; @@ -592,11 +591,10 @@ nfs_lock_and_join_requests(struct page *page) static void nfs_write_error(struct nfs_page *req, int error) { - trace_nfs_write_error(page_file_mapping(req->wb_page)->host, req, - error); - nfs_mapping_set_error(req->wb_page, error); + trace_nfs_write_error(nfs_page_to_inode(req), req, error); + nfs_mapping_set_error(nfs_page_to_folio(req), error); nfs_inode_remove_request(req); - nfs_end_page_writeback(req); + nfs_page_end_writeback(req); nfs_release_request(req); } @@ -604,21 +602,21 @@ static void nfs_write_error(struct nfs_page *req, int error) * Find an associated nfs write request, and prepare to flush it out * May return an error if the user signalled nfs_wait_on_request(). */ -static int nfs_page_async_flush(struct page *page, +static int nfs_page_async_flush(struct folio *folio, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) { struct nfs_page *req; int ret = 0; - req = nfs_lock_and_join_requests(page); + req = nfs_lock_and_join_requests(folio); if (!req) goto out; ret = PTR_ERR(req); if (IS_ERR(req)) goto out; - nfs_set_page_writeback(page); + nfs_folio_set_writeback(folio); WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); /* If there is a fatal error that covers this write, just exit */ @@ -636,12 +634,12 @@ static int nfs_page_async_flush(struct page *page, goto out_launder; if (wbc->sync_mode == WB_SYNC_NONE) ret = AOP_WRITEPAGE_ACTIVATE; - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); nfs_redirty_request(req); pgio->pg_error = 0; } else - nfs_add_stats(page_file_mapping(page)->host, - NFSIOS_WRITEPAGES, 1); + nfs_add_stats(folio_file_mapping(folio)->host, + NFSIOS_WRITEPAGES, 1); out: return ret; out_launder: @@ -649,21 +647,21 @@ out_launder: return 0; } -static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, +static int nfs_do_writepage(struct folio *folio, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) { - nfs_pageio_cond_complete(pgio, page_index(page)); - return nfs_page_async_flush(page, wbc, pgio); + nfs_pageio_cond_complete(pgio, folio_index(folio)); + return nfs_page_async_flush(folio, wbc, pgio); } /* * Write an mmapped page to the server. 
*/ -static int nfs_writepage_locked(struct page *page, +static int nfs_writepage_locked(struct folio *folio, struct writeback_control *wbc) { struct nfs_pageio_descriptor pgio; - struct inode *inode = page_file_mapping(page)->host; + struct inode *inode = folio_file_mapping(folio)->host; int err; if (wbc->sync_mode == WB_SYNC_NONE && @@ -671,9 +669,9 @@ static int nfs_writepage_locked(struct page *page, return AOP_WRITEPAGE_ACTIVATE; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); - nfs_pageio_init_write(&pgio, inode, 0, - false, &nfs_async_write_completion_ops); - err = nfs_do_writepage(page, wbc, &pgio); + nfs_pageio_init_write(&pgio, inode, 0, false, + &nfs_async_write_completion_ops); + err = nfs_do_writepage(folio, wbc, &pgio); pgio.pg_error = 0; nfs_pageio_complete(&pgio); return err; @@ -681,21 +679,23 @@ static int nfs_writepage_locked(struct page *page, int nfs_writepage(struct page *page, struct writeback_control *wbc) { + struct folio *folio = page_folio(page); int ret; - ret = nfs_writepage_locked(page, wbc); + ret = nfs_writepage_locked(folio, wbc); if (ret != AOP_WRITEPAGE_ACTIVATE) unlock_page(page); return ret; } -static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data) +static int nfs_writepages_callback(struct folio *folio, + struct writeback_control *wbc, void *data) { int ret; - ret = nfs_do_writepage(page, wbc, data); + ret = nfs_do_writepage(folio, wbc, data); if (ret != AOP_WRITEPAGE_ACTIVATE) - unlock_page(page); + folio_unlock(folio); return ret; } @@ -749,10 +749,11 @@ out_err: /* * Insert a write request into an inode */ -static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) +static void nfs_inode_add_request(struct nfs_page *req) { - struct address_space *mapping = page_file_mapping(req->wb_page); - struct nfs_inode *nfsi = NFS_I(inode); + struct folio *folio = nfs_page_to_folio(req); + struct address_space *mapping = folio_file_mapping(folio); + struct nfs_inode *nfsi = NFS_I(mapping->host); WARN_ON_ONCE(req->wb_this_page != req); @@ -764,10 +765,10 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) * with invalidate/truncate. 
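 *
 * The matching teardown in nfs_inode_remove_request() clears the
 * private pointer under the same lock; condensed sketch (drawn from
 * that function as converted in this patch, for illustration only):
 *
 *	spin_lock(&mapping->private_lock);
 *	folio->private = NULL;
 *	folio_clear_private(folio);
 *	clear_bit(PG_MAPPED, &req->wb_head->wb_flags);
 *	spin_unlock(&mapping->private_lock);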
*/ spin_lock(&mapping->private_lock); - if (likely(!PageSwapCache(req->wb_page))) { + if (likely(!folio_test_swapcache(folio))) { set_bit(PG_MAPPED, &req->wb_flags); - SetPagePrivate(req->wb_page); - set_page_private(req->wb_page, (unsigned long)req); + folio_set_private(folio); + folio->private = req; } spin_unlock(&mapping->private_lock); atomic_long_inc(&nfsi->nrequests); @@ -784,47 +785,43 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) */ static void nfs_inode_remove_request(struct nfs_page *req) { - struct address_space *mapping = page_file_mapping(req->wb_page); - struct inode *inode = mapping->host; - struct nfs_inode *nfsi = NFS_I(inode); - struct nfs_page *head; - if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { - head = req->wb_head; + struct folio *folio = nfs_page_to_folio(req->wb_head); + struct address_space *mapping = folio_file_mapping(folio); spin_lock(&mapping->private_lock); - if (likely(head->wb_page && !PageSwapCache(head->wb_page))) { - set_page_private(head->wb_page, 0); - ClearPagePrivate(head->wb_page); - clear_bit(PG_MAPPED, &head->wb_flags); + if (likely(folio && !folio_test_swapcache(folio))) { + folio->private = NULL; + folio_clear_private(folio); + clear_bit(PG_MAPPED, &req->wb_head->wb_flags); } spin_unlock(&mapping->private_lock); } if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) { nfs_release_request(req); - atomic_long_dec(&nfsi->nrequests); + atomic_long_dec(&NFS_I(nfs_page_to_inode(req))->nrequests); } } -static void -nfs_mark_request_dirty(struct nfs_page *req) +static void nfs_mark_request_dirty(struct nfs_page *req) { - if (req->wb_page) - __set_page_dirty_nobuffers(req->wb_page); + struct folio *folio = nfs_page_to_folio(req); + if (folio) + filemap_dirty_folio(folio_mapping(folio), folio); } /* * nfs_page_search_commits_for_head_request_locked * - * Search through commit lists on @inode for the head request for @page. + * Search through commit lists on @inode for the head request for @folio. * Must be called while holding the inode (which is cinfo) lock. * * Returns the head request if found, or NULL if not found. 
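 *
 * A typical caller pattern (condensed from nfs_folio_find_swap_request()
 * as converted in this patch, shown for illustration):
 *
 *	mutex_lock(&nfsi->commit_mutex);
 *	if (folio_test_swapcache(folio))
 *		req = nfs_page_search_commits_for_head_request_locked(nfsi,
 *								      folio);
 *	mutex_unlock(&nfsi->commit_mutex);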
*/ static struct nfs_page * nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, - struct page *page) + struct folio *folio) { struct nfs_page *freq, *t; struct nfs_commit_info cinfo; @@ -833,13 +830,13 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, nfs_init_cinfo_from_inode(&cinfo, inode); /* search through pnfs commit lists */ - freq = pnfs_search_commit_reqs(inode, &cinfo, page); + freq = pnfs_search_commit_reqs(inode, &cinfo, folio); if (freq) return freq->wb_head; /* Linearly search the commit list for the correct request */ list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) { - if (freq->wb_page == page) + if (nfs_page_to_folio(freq) == folio) return freq->wb_head; } @@ -887,8 +884,7 @@ nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo) mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo); mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); - if (req->wb_page) - nfs_mark_page_unstable(req->wb_page, cinfo); + nfs_folio_mark_unstable(nfs_page_to_folio(req), cinfo); } EXPORT_SYMBOL_GPL(nfs_request_add_commit_list); @@ -947,12 +943,15 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, nfs_request_add_commit_list(req, cinfo); } -static void -nfs_clear_page_commit(struct page *page) +static void nfs_folio_clear_commit(struct folio *folio) { - dec_node_page_state(page, NR_WRITEBACK); - dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb, - WB_WRITEBACK); + if (folio) { + long nr = folio_nr_pages(folio); + + node_stat_mod_folio(folio, NR_WRITEBACK, -nr); + wb_stat_mod(&inode_to_bdi(folio_file_mapping(folio)->host)->wb, + WB_WRITEBACK, -nr); + } } /* Called holding the request lock on @req */ @@ -970,7 +969,7 @@ nfs_clear_request_commit(struct nfs_page *req) nfs_request_remove_commit_list(req, &cinfo); } mutex_unlock(&NFS_I(inode)->commit_mutex); - nfs_clear_page_commit(req->wb_page); + nfs_folio_clear_commit(nfs_page_to_folio(req)); } } @@ -1002,7 +1001,8 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes < bytes)) { trace_nfs_comp_error(hdr->inode, req, hdr->error); - nfs_mapping_set_error(req->wb_page, hdr->error); + nfs_mapping_set_error(nfs_page_to_folio(req), + hdr->error); goto remove_req; } if (nfs_write_need_commit(hdr)) { @@ -1016,7 +1016,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) remove_req: nfs_inode_remove_request(req); next: - nfs_end_page_writeback(req); + nfs_page_end_writeback(req); nfs_release_request(req); } out: @@ -1092,10 +1092,9 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, * If the attempt fails, then the existing request is flushed out * to disk. 
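 *
 * When the update cannot be merged, the out_flushme path below
 * redirties the request and flushes the folio before retrying;
 * condensed (illustrative) form:
 *
 *	nfs_mark_request_dirty(req);
 *	nfs_unlock_and_release_request(req);
 *	error = nfs_wb_folio(folio_file_mapping(folio)->host, folio);
 *	return (error < 0) ? ERR_PTR(error) : NULL;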
*/ -static struct nfs_page *nfs_try_to_update_request(struct inode *inode, - struct page *page, - unsigned int offset, - unsigned int bytes) +static struct nfs_page *nfs_try_to_update_request(struct folio *folio, + unsigned int offset, + unsigned int bytes) { struct nfs_page *req; unsigned int rqend; @@ -1104,7 +1103,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, end = offset + bytes; - req = nfs_lock_and_join_requests(page); + req = nfs_lock_and_join_requests(folio); if (IS_ERR_OR_NULL(req)) return req; @@ -1137,7 +1136,7 @@ out_flushme: */ nfs_mark_request_dirty(req); nfs_unlock_and_release_request(req); - error = nfs_wb_page(inode, page); + error = nfs_wb_folio(folio_file_mapping(folio)->host, folio); return (error < 0) ? ERR_PTR(error) : NULL; } @@ -1148,40 +1147,42 @@ out_flushme: * if we have to add a new request. Also assumes that the caller has * already called nfs_flush_incompatible() if necessary. */ -static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, - struct page *page, unsigned int offset, unsigned int bytes) +static struct nfs_page *nfs_setup_write_request(struct nfs_open_context *ctx, + struct folio *folio, + unsigned int offset, + unsigned int bytes) { - struct inode *inode = page_file_mapping(page)->host; - struct nfs_page *req; + struct nfs_page *req; - req = nfs_try_to_update_request(inode, page, offset, bytes); + req = nfs_try_to_update_request(folio, offset, bytes); if (req != NULL) goto out; - req = nfs_create_request(ctx, page, offset, bytes); + req = nfs_page_create_from_folio(ctx, folio, offset, bytes); if (IS_ERR(req)) goto out; - nfs_inode_add_request(inode, req); + nfs_inode_add_request(req); out: return req; } -static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, - unsigned int offset, unsigned int count) +static int nfs_writepage_setup(struct nfs_open_context *ctx, + struct folio *folio, unsigned int offset, + unsigned int count) { - struct nfs_page *req; + struct nfs_page *req; - req = nfs_setup_write_request(ctx, page, offset, count); + req = nfs_setup_write_request(ctx, folio, offset, count); if (IS_ERR(req)) return PTR_ERR(req); /* Update file length */ - nfs_grow_file(page, offset, count); + nfs_grow_file(folio, offset, count); nfs_mark_uptodate(req); nfs_mark_request_dirty(req); nfs_unlock_and_release_request(req); return 0; } -int nfs_flush_incompatible(struct file *file, struct page *page) +int nfs_flush_incompatible(struct file *file, struct folio *folio) { struct nfs_open_context *ctx = nfs_file_open_context(file); struct nfs_lock_context *l_ctx; @@ -1197,12 +1198,12 @@ int nfs_flush_incompatible(struct file *file, struct page *page) * dropped page. */ do { - req = nfs_page_find_head_request(page); + req = nfs_folio_find_head_request(folio); if (req == NULL) return 0; l_ctx = req->wb_lock_context; - do_flush = req->wb_page != page || - !nfs_match_open_context(nfs_req_openctx(req), ctx); + do_flush = nfs_page_to_folio(req) != folio || + !nfs_match_open_context(nfs_req_openctx(req), ctx); if (l_ctx && flctx && !(list_empty_careful(&flctx->flc_posix) && list_empty_careful(&flctx->flc_flock))) { @@ -1211,7 +1212,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page) nfs_release_request(req); if (!do_flush) return 0; - status = nfs_wb_page(page_file_mapping(page)->host, page); + status = nfs_wb_folio(folio_file_mapping(folio)->host, folio); } while (status == 0); return status; } @@ -1283,9 +1284,9 @@ out: * the PageUptodate() flag. 
In this case, we will need to turn off * write optimisations that depend on the page contents being correct. */ -static bool nfs_write_pageuptodate(struct page *page, struct inode *inode, - unsigned int pagelen) +static bool nfs_folio_write_uptodate(struct folio *folio, unsigned int pagelen) { + struct inode *inode = folio_file_mapping(folio)->host; struct nfs_inode *nfsi = NFS_I(inode); if (nfs_have_delegated_attributes(inode)) @@ -1299,7 +1300,7 @@ static bool nfs_write_pageuptodate(struct page *page, struct inode *inode, out: if (nfsi->cache_validity & NFS_INO_INVALID_DATA && pagelen != 0) return false; - return PageUptodate(page) != 0; + return folio_test_uptodate(folio) != 0; } static bool @@ -1317,16 +1318,17 @@ is_whole_file_wrlock(struct file_lock *fl) * If the file is opened for synchronous writes then we can just skip the rest * of the checks. */ -static int nfs_can_extend_write(struct file *file, struct page *page, - struct inode *inode, unsigned int pagelen) +static int nfs_can_extend_write(struct file *file, struct folio *folio, + unsigned int pagelen) { - int ret; + struct inode *inode = file_inode(file); struct file_lock_context *flctx = locks_inode_context(inode); struct file_lock *fl; + int ret; if (file->f_flags & O_DSYNC) return 0; - if (!nfs_write_pageuptodate(page, inode, pagelen)) + if (!nfs_folio_write_uptodate(folio, pagelen)) return 0; if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) return 1; @@ -1358,33 +1360,33 @@ static int nfs_can_extend_write(struct file *file, struct page *page, * XXX: Keep an eye on generic_file_read to make sure it doesn't do bad * things with a page scheduled for an RPC call (e.g. invalidate it). */ -int nfs_updatepage(struct file *file, struct page *page, - unsigned int offset, unsigned int count) +int nfs_update_folio(struct file *file, struct folio *folio, + unsigned int offset, unsigned int count) { struct nfs_open_context *ctx = nfs_file_open_context(file); - struct address_space *mapping = page_file_mapping(page); - struct inode *inode = mapping->host; - unsigned int pagelen = nfs_page_length(page); + struct address_space *mapping = folio_file_mapping(folio); + struct inode *inode = mapping->host; + unsigned int pagelen = nfs_folio_length(folio); int status = 0; nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); - dprintk("NFS: nfs_updatepage(%pD2 %d@%lld)\n", - file, count, (long long)(page_file_offset(page) + offset)); + dprintk("NFS: nfs_update_folio(%pD2 %d@%lld)\n", file, count, + (long long)(folio_file_pos(folio) + offset)); if (!count) goto out; - if (nfs_can_extend_write(file, page, inode, pagelen)) { + if (nfs_can_extend_write(file, folio, pagelen)) { count = max(count + offset, pagelen); offset = 0; } - status = nfs_writepage_setup(ctx, page, offset, count); + status = nfs_writepage_setup(ctx, folio, offset, count); if (status < 0) nfs_set_pageerror(mapping); out: - dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", + dprintk("NFS: nfs_update_folio returns %d (isize %lld)\n", status, (long long)i_size_read(inode)); return status; } @@ -1420,13 +1422,13 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr, */ static void nfs_redirty_request(struct nfs_page *req) { - struct nfs_inode *nfsi = NFS_I(page_file_mapping(req->wb_page)->host); + struct nfs_inode *nfsi = NFS_I(nfs_page_to_inode(req)); /* Bump the transmission count */ req->wb_nio++; nfs_mark_request_dirty(req); atomic_long_inc(&nfsi->redirtied_pages); - nfs_end_page_writeback(req); + nfs_page_end_writeback(req); nfs_release_request(req); } @@ 
-1784,18 +1786,18 @@ void nfs_retry_commit(struct list_head *page_list, req = nfs_list_entry(page_list->next); nfs_list_remove_request(req); nfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx); - if (!cinfo->dreq) - nfs_clear_page_commit(req->wb_page); + nfs_folio_clear_commit(nfs_page_to_folio(req)); nfs_unlock_and_release_request(req); } } EXPORT_SYMBOL_GPL(nfs_retry_commit); -static void -nfs_commit_resched_write(struct nfs_commit_info *cinfo, - struct nfs_page *req) +static void nfs_commit_resched_write(struct nfs_commit_info *cinfo, + struct nfs_page *req) { - __set_page_dirty_nobuffers(req->wb_page); + struct folio *folio = nfs_page_to_folio(req); + + filemap_dirty_folio(folio_mapping(folio), folio); } /* @@ -1846,12 +1848,13 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) int status = data->task.tk_status; struct nfs_commit_info cinfo; struct nfs_server *nfss; + struct folio *folio; while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); - if (req->wb_page) - nfs_clear_page_commit(req->wb_page); + folio = nfs_page_to_folio(req); + nfs_folio_clear_commit(folio); dprintk("NFS: commit (%s/%llu %d@%lld)", nfs_req_openctx(req)->dentry->d_sb->s_id, @@ -1859,10 +1862,10 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) req->wb_bytes, (long long)req_offset(req)); if (status < 0) { - if (req->wb_page) { + if (folio) { trace_nfs_commit_error(data->inode, req, status); - nfs_mapping_set_error(req->wb_page, status); + nfs_mapping_set_error(folio, status); nfs_inode_remove_request(req); } dprintk_cont(", error = %d\n", status); @@ -1873,7 +1876,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) * returned by the server against all stored verfs. */ if (nfs_write_match_verf(verf, req)) { /* We have a match */ - if (req->wb_page) + if (folio) nfs_inode_remove_request(req); dprintk_cont(" OK\n"); goto next; @@ -2054,7 +2057,7 @@ int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio) /* blocking call to cancel all requests and join to a single (head) * request */ - req = nfs_lock_and_join_requests(&folio->page); + req = nfs_lock_and_join_requests(folio); if (IS_ERR(req)) { ret = PTR_ERR(req); @@ -2070,13 +2073,18 @@ int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio) return ret; } -/* - * Write back all requests on one page - we do this before reading it. +/** + * nfs_wb_folio - Write back all requests on one folio + * @inode: pointer to inode + * @folio: pointer to folio + * + * Assumes that the folio has been locked by the caller, and will + * not unlock it.
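+ *
+ * The writeback window is computed from the folio itself, so this
+ * handles any folio order; minimal sketch (mirrors the code below):
+ *
+ *	loff_t range_start = folio_file_pos(folio);
+ *	loff_t range_end = range_start + (loff_t)folio_size(folio) - 1;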
*/ -int nfs_wb_page(struct inode *inode, struct page *page) +int nfs_wb_folio(struct inode *inode, struct folio *folio) { - loff_t range_start = page_file_offset(page); - loff_t range_end = range_start + (loff_t)(PAGE_SIZE - 1); + loff_t range_start = folio_file_pos(folio); + loff_t range_end = range_start + (loff_t)folio_size(folio) - 1; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = 0, @@ -2085,25 +2093,25 @@ int nfs_wb_page(struct inode *inode, struct page *page) }; int ret; - trace_nfs_writeback_page_enter(inode); + trace_nfs_writeback_folio(inode, folio); for (;;) { - wait_on_page_writeback(page); - if (clear_page_dirty_for_io(page)) { - ret = nfs_writepage_locked(page, &wbc); + folio_wait_writeback(folio); + if (folio_clear_dirty_for_io(folio)) { + ret = nfs_writepage_locked(folio, &wbc); if (ret < 0) goto out_error; continue; } ret = 0; - if (!PagePrivate(page)) + if (!folio_test_private(folio)) break; ret = nfs_commit_inode(inode, FLUSH_SYNC); if (ret < 0) goto out_error; } out_error: - trace_nfs_writeback_page_exit(inode, ret); + trace_nfs_writeback_folio_done(inode, folio, ret); return ret; } diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index 0a9b72685f98..1479583fbb62 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -9,6 +9,7 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/fs.h> +#include <linux/filelock.h> static unsigned int grace_net_id; static DEFINE_SPINLOCK(grace_lock); diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c deleted file mode 100644 index 76bee0a0d308..000000000000 --- a/fs/nfsd/fault_inject.c +++ /dev/null @@ -1,142 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com> - * - * Uses debugfs to create fault injection points for client testing - */ - -#include <linux/types.h> -#include <linux/fs.h> -#include <linux/debugfs.h> -#include <linux/module.h> -#include <linux/nsproxy.h> -#include <linux/sunrpc/addr.h> -#include <linux/uaccess.h> -#include <linux/kernel.h> - -#include "state.h" -#include "netns.h" - -struct nfsd_fault_inject_op { - char *file; - u64 (*get)(void); - u64 (*set_val)(u64); - u64 (*set_clnt)(struct sockaddr_storage *, size_t); -}; - -static struct dentry *debug_dir; - -static ssize_t fault_inject_read(struct file *file, char __user *buf, - size_t len, loff_t *ppos) -{ - static u64 val; - char read_buf[25]; - size_t size; - loff_t pos = *ppos; - struct nfsd_fault_inject_op *op = file_inode(file)->i_private; - - if (!pos) - val = op->get(); - size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val); - - return simple_read_from_buffer(buf, len, ppos, read_buf, size); -} - -static ssize_t fault_inject_write(struct file *file, const char __user *buf, - size_t len, loff_t *ppos) -{ - char write_buf[INET6_ADDRSTRLEN]; - size_t size = min(sizeof(write_buf) - 1, len); - struct net *net = current->nsproxy->net_ns; - struct sockaddr_storage sa; - struct nfsd_fault_inject_op *op = file_inode(file)->i_private; - u64 val; - char *nl; - - if (copy_from_user(write_buf, buf, size)) - return -EFAULT; - write_buf[size] = '\0'; - - /* Deal with any embedded newlines in the string */ - nl = strchr(write_buf, '\n'); - if (nl) { - size = nl - write_buf; - *nl = '\0'; - } - - size = rpc_pton(net, write_buf, size, (struct sockaddr *)&sa, sizeof(sa)); - if (size > 0) { - val = op->set_clnt(&sa, size); - if (val) - pr_info("NFSD [%s]: Client %s had %llu state object(s)\n", - op->file, write_buf, val); - } else { 
- val = simple_strtoll(write_buf, NULL, 0); - if (val == 0) - pr_info("NFSD Fault Injection: %s (all)", op->file); - else - pr_info("NFSD Fault Injection: %s (n = %llu)", - op->file, val); - val = op->set_val(val); - pr_info("NFSD: %s: found %llu", op->file, val); - } - return len; /* on success, claim we got the whole input */ -} - -static const struct file_operations fops_nfsd = { - .owner = THIS_MODULE, - .read = fault_inject_read, - .write = fault_inject_write, -}; - -void nfsd_fault_inject_cleanup(void) -{ - debugfs_remove_recursive(debug_dir); -} - -static struct nfsd_fault_inject_op inject_ops[] = { - { - .file = "forget_clients", - .get = nfsd_inject_print_clients, - .set_val = nfsd_inject_forget_clients, - .set_clnt = nfsd_inject_forget_client, - }, - { - .file = "forget_locks", - .get = nfsd_inject_print_locks, - .set_val = nfsd_inject_forget_locks, - .set_clnt = nfsd_inject_forget_client_locks, - }, - { - .file = "forget_openowners", - .get = nfsd_inject_print_openowners, - .set_val = nfsd_inject_forget_openowners, - .set_clnt = nfsd_inject_forget_client_openowners, - }, - { - .file = "forget_delegations", - .get = nfsd_inject_print_delegations, - .set_val = nfsd_inject_forget_delegations, - .set_clnt = nfsd_inject_forget_client_delegations, - }, - { - .file = "recall_delegations", - .get = nfsd_inject_print_delegations, - .set_val = nfsd_inject_recall_delegations, - .set_clnt = nfsd_inject_recall_client_delegations, - }, -}; - -void nfsd_fault_inject_init(void) -{ - unsigned int i; - struct nfsd_fault_inject_op *op; - umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; - - debug_dir = debugfs_create_dir("nfsd", NULL); - - for (i = 0; i < ARRAY_SIZE(inject_ops); i++) { - op = &inject_ops[i]; - debugfs_create_file(op->file, mode, debug_dir, op, &fops_nfsd); - } -} diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 0ef070349014..6e8712bd7c99 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -331,37 +331,27 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may) return nf; } +/** + * nfsd_file_check_write_error - check for writeback errors on a file + * @nf: nfsd_file to check for writeback errors + * + * Check whether a nfsd_file has an unseen error. Reset the write + * verifier if so. + */ static void -nfsd_file_fsync(struct nfsd_file *nf) -{ - struct file *file = nf->nf_file; - int ret; - - if (!file || !(file->f_mode & FMODE_WRITE)) - return; - ret = vfs_fsync(file, 1); - trace_nfsd_file_fsync(nf, ret); - if (ret) - nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); -} - -static int nfsd_file_check_write_error(struct nfsd_file *nf) { struct file *file = nf->nf_file; - if (!file || !(file->f_mode & FMODE_WRITE)) - return 0; - return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err)); + if ((file->f_mode & FMODE_WRITE) && + filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err))) + nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); } static void nfsd_file_hash_remove(struct nfsd_file *nf) { trace_nfsd_file_unhash(nf); - - if (nfsd_file_check_write_error(nf)) - nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); rhashtable_remove_fast(&nfsd_file_rhash_tbl, &nf->nf_rhash, nfsd_file_rhash_params); } @@ -387,23 +377,12 @@ nfsd_file_free(struct nfsd_file *nf) this_cpu_add(nfsd_file_total_age, age); nfsd_file_unhash(nf); - - /* - * We call fsync here in order to catch writeback errors. 
It's not - * strictly required by the protocol, but an nfsd_file could get - * evicted from the cache before a COMMIT comes in. If another - * task were to open that file in the interim and scrape the error, - * then the client may never see it. By calling fsync here, we ensure - * that writeback happens before the entry is freed, and that any - * errors reported result in the write verifier changing. - */ - nfsd_file_fsync(nf); - if (nf->nf_mark) nfsd_file_mark_put(nf->nf_mark); if (nf->nf_file) { get_file(nf->nf_file); filp_close(nf->nf_file, NULL); + nfsd_file_check_write_error(nf); fput(nf->nf_file); } @@ -452,7 +431,7 @@ static bool nfsd_file_lru_remove(struct nfsd_file *nf) struct nfsd_file * nfsd_file_get(struct nfsd_file *nf) { - if (likely(refcount_inc_not_zero(&nf->nf_ref))) + if (nf && refcount_inc_not_zero(&nf->nf_ref)) return nf; return NULL; } @@ -662,6 +641,39 @@ static struct shrinker nfsd_file_shrinker = { }; /** + * nfsd_file_cond_queue - conditionally unhash and queue a nfsd_file + * @nf: nfsd_file to attempt to queue + * @dispose: private list to queue successfully-put objects + * + * Unhash an nfsd_file, try to get a reference to it, and then put that + * reference. If it's the last reference, queue it to the dispose list. + */ +static void +nfsd_file_cond_queue(struct nfsd_file *nf, struct list_head *dispose) + __must_hold(RCU) +{ + int decrement = 1; + + /* If we raced with someone else unhashing, ignore it */ + if (!nfsd_file_unhash(nf)) + return; + + /* If we can't get a reference, ignore it */ + if (!nfsd_file_get(nf)) + return; + + /* Extra decrement if we remove from the LRU */ + if (nfsd_file_lru_remove(nf)) + ++decrement; + + /* If refcount goes to 0, then put on the dispose list */ + if (refcount_sub_and_test(decrement, &nf->nf_ref)) { + list_add(&nf->nf_lru, dispose); + trace_nfsd_file_closing(nf); + } +} + +/** * nfsd_file_queue_for_close: try to close out any open nfsd_files for an inode * @inode: inode on which to close out nfsd_files * @dispose: list on which to gather nfsd_files to close out @@ -688,30 +700,11 @@ nfsd_file_queue_for_close(struct inode *inode, struct list_head *dispose) rcu_read_lock(); do { - int decrement = 1; - nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key, nfsd_file_rhash_params); if (!nf) break; - - /* If we raced with someone else unhashing, ignore it */ - if (!nfsd_file_unhash(nf)) - continue; - - /* If we can't get a reference, ignore it */ - if (!nfsd_file_get(nf)) - continue; - - /* Extra decrement if we remove from the LRU */ - if (nfsd_file_lru_remove(nf)) - ++decrement; - - /* If refcount goes to 0, then put on the dispose list */ - if (refcount_sub_and_test(decrement, &nf->nf_ref)) { - list_add(&nf->nf_lru, dispose); - trace_nfsd_file_closing(nf); - } + nfsd_file_cond_queue(nf, dispose); } while (1); rcu_read_unlock(); } @@ -928,11 +921,8 @@ __nfsd_file_cache_purge(struct net *net) nf = rhashtable_walk_next(&iter); while (!IS_ERR_OR_NULL(nf)) { - if (!net || nf->nf_net == net) { - nfsd_file_unhash(nf); - nfsd_file_lru_remove(nf); - list_add(&nf->nf_lru, &dispose); - } + if (!net || nf->nf_net == net) + nfsd_file_cond_queue(nf, &dispose); nf = rhashtable_walk_next(&iter); } @@ -1096,8 +1086,7 @@ retry: rcu_read_lock(); nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key, nfsd_file_rhash_params); - if (nf) - nf = nfsd_file_get(nf); + nf = nfsd_file_get(nf); rcu_read_unlock(); if (nf) { @@ -1148,6 +1137,7 @@ wait_for_construction: out: if (status == nfs_ok) { this_cpu_inc(nfsd_file_acquisitions); + 
nfsd_file_check_write_error(nf); *pnf = nf; } else { if (refcount_dec_and_test(&nf->nf_ref)) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 51a4b7885cae..ec49b200b797 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -10,6 +10,7 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <linux/filelock.h> #include <linux/percpu_counter.h> #include <linux/siphash.h> diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 1457f59f447a..12b2b9bc07bf 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -113,11 +113,11 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp) inode_lock(inode); - error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_ACCESS, + error = set_posix_acl(&nop_mnt_idmap, fh->fh_dentry, ACL_TYPE_ACCESS, argp->acl_access); if (error) goto out_drop_lock; - error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_DEFAULT, + error = set_posix_acl(&nop_mnt_idmap, fh->fh_dentry, ACL_TYPE_DEFAULT, argp->acl_default); if (error) goto out_drop_lock; @@ -377,10 +377,11 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { }, }; -static unsigned int nfsd_acl_count2[ARRAY_SIZE(nfsd_acl_procedures2)]; +static DEFINE_PER_CPU_ALIGNED(unsigned long, + nfsd_acl_count2[ARRAY_SIZE(nfsd_acl_procedures2)]); const struct svc_version nfsd_acl_version2 = { .vs_vers = 2, - .vs_nproc = 5, + .vs_nproc = ARRAY_SIZE(nfsd_acl_procedures2), .vs_proc = nfsd_acl_procedures2, .vs_count = nfsd_acl_count2, .vs_dispatch = nfsd_dispatch, diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 647108138e8a..73adca47d373 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -103,11 +103,11 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp) inode_lock(inode); - error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_ACCESS, + error = set_posix_acl(&nop_mnt_idmap, fh->fh_dentry, ACL_TYPE_ACCESS, argp->acl_access); if (error) goto out_drop_lock; - error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_DEFAULT, + error = set_posix_acl(&nop_mnt_idmap, fh->fh_dentry, ACL_TYPE_DEFAULT, argp->acl_default); out_drop_lock: @@ -266,10 +266,11 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = { }, }; -static unsigned int nfsd_acl_count3[ARRAY_SIZE(nfsd_acl_procedures3)]; +static DEFINE_PER_CPU_ALIGNED(unsigned long, + nfsd_acl_count3[ARRAY_SIZE(nfsd_acl_procedures3)]); const struct svc_version nfsd_acl_version3 = { .vs_vers = 3, - .vs_nproc = 3, + .vs_nproc = ARRAY_SIZE(nfsd_acl_procedures3), .vs_proc = nfsd_acl_procedures3, .vs_count = nfsd_acl_count3, .vs_dispatch = nfsd_dispatch, diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index d01b29aba662..e6bb8eeb5bc2 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -320,7 +320,7 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, iap->ia_mode &= ~current_umask(); fh_fill_pre_attrs(fhp); - host_err = vfs_create(&init_user_ns, inode, child, iap->ia_mode, true); + host_err = vfs_create(&nop_mnt_idmap, inode, child, iap->ia_mode, true); if (host_err < 0) { status = nfserrno(host_err); goto out; @@ -1064,10 +1064,11 @@ static const struct svc_procedure nfsd_procedures3[22] = { }, }; -static unsigned int nfsd_count3[ARRAY_SIZE(nfsd_procedures3)]; +static DEFINE_PER_CPU_ALIGNED(unsigned long, + nfsd_count3[ARRAY_SIZE(nfsd_procedures3)]); const struct svc_version nfsd_version3 = { .vs_vers = 3, - .vs_nproc = 22, + .vs_nproc = ARRAY_SIZE(nfsd_procedures3), .vs_proc = nfsd_procedures3, .vs_dispatch = nfsd_dispatch, .vs_count = nfsd_count3, diff --git 
a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 3564d1c6f610..e8a80052cb1b 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -323,11 +323,11 @@ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls) if (ls->ls_recalled) goto out_unlock; - ls->ls_recalled = true; - atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls); if (list_empty(&ls->ls_layouts)) goto out_unlock; + ls->ls_recalled = true; + atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls); trace_nfsd_layout_recall(&ls->ls_stid.sc_stateid); refcount_inc(&ls->ls_stid.sc_count); diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index f189ba7995f5..5ae670807449 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1214,8 +1214,10 @@ out: return status; out_put_dst: nfsd_file_put(*dst); + *dst = NULL; out_put_src: nfsd_file_put(*src); + *src = NULL; goto out; } @@ -1293,15 +1295,15 @@ extern void nfs_sb_deactive(struct super_block *sb); * setup a work entry in the ssc delayed unmount list. */ static __be32 nfsd4_ssc_setup_dul(struct nfsd_net *nn, char *ipaddr, - struct nfsd4_ssc_umount_item **retwork, struct vfsmount **ss_mnt) + struct nfsd4_ssc_umount_item **nsui) { struct nfsd4_ssc_umount_item *ni = NULL; struct nfsd4_ssc_umount_item *work = NULL; struct nfsd4_ssc_umount_item *tmp; DEFINE_WAIT(wait); + __be32 status = 0; - *ss_mnt = NULL; - *retwork = NULL; + *nsui = NULL; work = kzalloc(sizeof(*work), GFP_KERNEL); try_again: spin_lock(&nn->nfsd_ssc_lock); @@ -1325,12 +1327,12 @@ try_again: finish_wait(&nn->nfsd_ssc_waitq, &wait); goto try_again; } - *ss_mnt = ni->nsui_vfsmount; + *nsui = ni; refcount_inc(&ni->nsui_refcnt); spin_unlock(&nn->nfsd_ssc_lock); kfree(work); - /* return vfsmount in ss_mnt */ + /* return vfsmount in (*nsui)->nsui_vfsmount */ return 0; } if (work) { @@ -1338,31 +1340,32 @@ try_again: refcount_set(&work->nsui_refcnt, 2); work->nsui_busy = true; list_add_tail(&work->nsui_list, &nn->nfsd_ssc_mount_list); - *retwork = work; - } + *nsui = work; + } else + status = nfserr_resource; spin_unlock(&nn->nfsd_ssc_lock); - return 0; + return status; } -static void nfsd4_ssc_update_dul_work(struct nfsd_net *nn, - struct nfsd4_ssc_umount_item *work, struct vfsmount *ss_mnt) +static void nfsd4_ssc_update_dul(struct nfsd_net *nn, + struct nfsd4_ssc_umount_item *nsui, + struct vfsmount *ss_mnt) { - /* set nsui_vfsmount, clear busy flag and wakeup waiters */ spin_lock(&nn->nfsd_ssc_lock); - work->nsui_vfsmount = ss_mnt; - work->nsui_busy = false; + nsui->nsui_vfsmount = ss_mnt; + nsui->nsui_busy = false; wake_up_all(&nn->nfsd_ssc_waitq); spin_unlock(&nn->nfsd_ssc_lock); } -static void nfsd4_ssc_cancel_dul_work(struct nfsd_net *nn, - struct nfsd4_ssc_umount_item *work) +static void nfsd4_ssc_cancel_dul(struct nfsd_net *nn, + struct nfsd4_ssc_umount_item *nsui) { spin_lock(&nn->nfsd_ssc_lock); - list_del(&work->nsui_list); + list_del(&nsui->nsui_list); wake_up_all(&nn->nfsd_ssc_waitq); spin_unlock(&nn->nfsd_ssc_lock); - kfree(work); + kfree(nsui); } /* @@ -1370,7 +1373,7 @@ static void nfsd4_ssc_cancel_dul_work(struct nfsd_net *nn, */ static __be32 nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp, - struct vfsmount **mount) + struct nfsd4_ssc_umount_item **nsui) { struct file_system_type *type; struct vfsmount *ss_mnt; @@ -1381,7 +1384,6 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp, char *ipaddr, *dev_name, *raw_data; int len, raw_len; __be32 status = nfserr_inval; - struct nfsd4_ssc_umount_item *work = NULL; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), 
nfsd_net_id); naddr = &nss->u.nl4_addr; @@ -1389,6 +1391,7 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp, naddr->addr_len, (struct sockaddr *)&tmp_addr, sizeof(tmp_addr)); + *nsui = NULL; if (tmp_addrlen == 0) goto out_err; @@ -1431,10 +1434,10 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp, goto out_free_rawdata; snprintf(dev_name, len + 5, "%s%s%s:/", startsep, ipaddr, endsep); - status = nfsd4_ssc_setup_dul(nn, ipaddr, &work, &ss_mnt); + status = nfsd4_ssc_setup_dul(nn, ipaddr, nsui); if (status) goto out_free_devname; - if (ss_mnt) + if ((*nsui)->nsui_vfsmount) goto out_done; /* Use an 'internal' mount: SB_KERNMOUNT -> MNT_INTERNAL */ @@ -1442,15 +1445,12 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp, module_put(type->owner); if (IS_ERR(ss_mnt)) { status = nfserr_nodev; - if (work) - nfsd4_ssc_cancel_dul_work(nn, work); + nfsd4_ssc_cancel_dul(nn, *nsui); goto out_free_devname; } - if (work) - nfsd4_ssc_update_dul_work(nn, work, ss_mnt); + nfsd4_ssc_update_dul(nn, *nsui, ss_mnt); out_done: status = 0; - *mount = ss_mnt; out_free_devname: kfree(dev_name); @@ -1474,7 +1474,7 @@ out_err: static __be32 nfsd4_setup_inter_ssc(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, - struct nfsd4_copy *copy, struct vfsmount **mount) + struct nfsd4_copy *copy) { struct svc_fh *s_fh = NULL; stateid_t *s_stid = &copy->cp_src_stateid; @@ -1487,7 +1487,7 @@ nfsd4_setup_inter_ssc(struct svc_rqst *rqstp, if (status) goto out; - status = nfsd4_interssc_connect(copy->cp_src, rqstp, mount); + status = nfsd4_interssc_connect(copy->cp_src, rqstp, &copy->ss_nsui); if (status) goto out; @@ -1505,45 +1505,26 @@ out: } static void -nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct file *filp, +nfsd4_cleanup_inter_ssc(struct nfsd4_ssc_umount_item *nsui, struct file *filp, struct nfsd_file *dst) { - bool found = false; - long timeout; - struct nfsd4_ssc_umount_item *tmp; - struct nfsd4_ssc_umount_item *ni = NULL; struct nfsd_net *nn = net_generic(dst->nf_net, nfsd_net_id); + long timeout = msecs_to_jiffies(nfsd4_ssc_umount_timeout); nfs42_ssc_close(filp); - nfsd_file_put(dst); fput(filp); - if (!nn) { - mntput(ss_mnt); - return; - } spin_lock(&nn->nfsd_ssc_lock); - timeout = msecs_to_jiffies(nfsd4_ssc_umount_timeout); - list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) { - if (ni->nsui_vfsmount->mnt_sb == ss_mnt->mnt_sb) { - list_del(&ni->nsui_list); - /* - * vfsmount can be shared by multiple exports, - * decrement refcnt. If the count drops to 1 it - * will be unmounted when nsui_expire expires. - */ - refcount_dec(&ni->nsui_refcnt); - ni->nsui_expire = jiffies + timeout; - list_add_tail(&ni->nsui_list, &nn->nfsd_ssc_mount_list); - found = true; - break; - } - } + list_del(&nsui->nsui_list); + /* + * vfsmount can be shared by multiple exports, + * decrement refcnt. If the count drops to 1 it + * will be unmounted when nsui_expire expires.
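+ *
+ * Condensed (illustrative) form of the steps that follow: the item
+ * is unhooked, its refcount dropped, and it is re-queued at the tail
+ * with a fresh expiry so the laundromat can unmount it later:
+ *
+ *	refcount_dec(&nsui->nsui_refcnt);
+ *	nsui->nsui_expire = jiffies + timeout;
+ *	list_add_tail(&nsui->nsui_list, &nn->nfsd_ssc_mount_list);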
+ */ + refcount_dec(&nsui->nsui_refcnt); + nsui->nsui_expire = jiffies + timeout; + list_add_tail(&nsui->nsui_list, &nn->nfsd_ssc_mount_list); spin_unlock(&nn->nfsd_ssc_lock); - if (!found) { - mntput(ss_mnt); - return; - } } #else /* CONFIG_NFSD_V4_2_INTER_SSC */ @@ -1551,15 +1532,13 @@ nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct file *filp, static __be32 nfsd4_setup_inter_ssc(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, - struct nfsd4_copy *copy, - struct vfsmount **mount) + struct nfsd4_copy *copy) { - *mount = NULL; return nfserr_inval; } static void -nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct file *filp, +nfsd4_cleanup_inter_ssc(struct nfsd4_ssc_umount_item *nsui, struct file *filp, struct nfsd_file *dst) { } @@ -1582,13 +1561,6 @@ nfsd4_setup_intra_ssc(struct svc_rqst *rqstp, &copy->nf_dst); } -static void -nfsd4_cleanup_intra_ssc(struct nfsd_file *src, struct nfsd_file *dst) -{ - nfsd_file_put(src); - nfsd_file_put(dst); -} - static void nfsd4_cb_offload_release(struct nfsd4_callback *cb) { struct nfsd4_cb_offload *cbo = @@ -1700,18 +1672,27 @@ static void dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst) memcpy(dst->cp_src, src->cp_src, sizeof(struct nl4_server)); memcpy(&dst->stateid, &src->stateid, sizeof(src->stateid)); memcpy(&dst->c_fh, &src->c_fh, sizeof(src->c_fh)); - dst->ss_mnt = src->ss_mnt; + dst->ss_nsui = src->ss_nsui; +} + +static void release_copy_files(struct nfsd4_copy *copy) +{ + if (copy->nf_src) + nfsd_file_put(copy->nf_src); + if (copy->nf_dst) + nfsd_file_put(copy->nf_dst); } static void cleanup_async_copy(struct nfsd4_copy *copy) { nfs4_free_copy_state(copy); - nfsd_file_put(copy->nf_dst); - if (!nfsd4_ssc_is_inter(copy)) - nfsd_file_put(copy->nf_src); - spin_lock(&copy->cp_clp->async_lock); - list_del(&copy->copies); - spin_unlock(&copy->cp_clp->async_lock); + release_copy_files(copy); + if (copy->cp_clp) { + spin_lock(&copy->cp_clp->async_lock); + if (!list_empty(&copy->copies)) + list_del_init(&copy->copies); + spin_unlock(&copy->cp_clp->async_lock); + } nfs4_put_copy(copy); } @@ -1749,8 +1730,8 @@ static int nfsd4_do_async_copy(void *data) if (nfsd4_ssc_is_inter(copy)) { struct file *filp; - filp = nfs42_ssc_open(copy->ss_mnt, &copy->c_fh, - &copy->stateid); + filp = nfs42_ssc_open(copy->ss_nsui->nsui_vfsmount, + &copy->c_fh, &copy->stateid); if (IS_ERR(filp)) { switch (PTR_ERR(filp)) { case -EBADF: @@ -1764,11 +1745,10 @@ } nfserr = nfsd4_do_copy(copy, filp, copy->nf_dst->nf_file, false); - nfsd4_cleanup_inter_ssc(copy->ss_mnt, filp, copy->nf_dst); + nfsd4_cleanup_inter_ssc(copy->ss_nsui, filp, copy->nf_dst); } else { nfserr = nfsd4_do_copy(copy, copy->nf_src->nf_file, copy->nf_dst->nf_file, false); - nfsd4_cleanup_intra_ssc(copy->nf_src, copy->nf_dst); } do_callback: @@ -1790,8 +1770,7 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfserr_notsupp; goto out; } - status = nfsd4_setup_inter_ssc(rqstp, cstate, copy, - &copy->ss_mnt); + status = nfsd4_setup_inter_ssc(rqstp, cstate, copy); if (status) return nfserr_offload_denied; } else { @@ -1810,12 +1789,13 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, async_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL); if (!async_copy) goto out_err; + INIT_LIST_HEAD(&async_copy->copies); + refcount_set(&async_copy->refcount, 1); async_copy->cp_src = kmalloc(sizeof(*async_copy->cp_src), GFP_KERNEL); if (!async_copy->cp_src) goto out_err; if (!nfs4_init_copy_state(nn, copy)) goto out_err; -
refcount_set(&async_copy->refcount, 1); memcpy(©->cp_res.cb_stateid, ©->cp_stateid.cs_stid, sizeof(copy->cp_res.cb_stateid)); dup_copy_fields(copy, async_copy); @@ -1832,38 +1812,53 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } else { status = nfsd4_do_copy(copy, copy->nf_src->nf_file, copy->nf_dst->nf_file, true); - nfsd4_cleanup_intra_ssc(copy->nf_src, copy->nf_dst); } out: + release_copy_files(copy); return status; out_err: + if (nfsd4_ssc_is_inter(copy)) { + /* + * Source's vfsmount of inter-copy will be unmounted + * by the laundromat. Use copy instead of async_copy + * since async_copy->ss_nsui might not be set yet. + */ + refcount_dec(©->ss_nsui->nsui_refcnt); + } if (async_copy) cleanup_async_copy(async_copy); status = nfserrno(-ENOMEM); - /* - * source's vfsmount of inter-copy will be unmounted - * by the laundromat - */ goto out; } -struct nfsd4_copy * -find_async_copy(struct nfs4_client *clp, stateid_t *stateid) +static struct nfsd4_copy * +find_async_copy_locked(struct nfs4_client *clp, stateid_t *stateid) { struct nfsd4_copy *copy; - spin_lock(&clp->async_lock); + lockdep_assert_held(&clp->async_lock); + list_for_each_entry(copy, &clp->async_copies, copies) { if (memcmp(©->cp_stateid.cs_stid, stateid, NFS4_STATEID_SIZE)) continue; - refcount_inc(©->refcount); - spin_unlock(&clp->async_lock); return copy; } - spin_unlock(&clp->async_lock); return NULL; } +static struct nfsd4_copy * +find_async_copy(struct nfs4_client *clp, stateid_t *stateid) +{ + struct nfsd4_copy *copy; + + spin_lock(&clp->async_lock); + copy = find_async_copy_locked(clp, stateid); + if (copy) + refcount_inc(©->refcount); + spin_unlock(&clp->async_lock); + return copy; +} + static __be32 nfsd4_offload_cancel(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, @@ -1948,22 +1943,24 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfsd_file_put(nf); return status; } + static __be32 nfsd4_offload_status(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_offload_status *os = &u->offload_status; - __be32 status = 0; + __be32 status = nfs_ok; struct nfsd4_copy *copy; struct nfs4_client *clp = cstate->clp; - copy = find_async_copy(clp, &os->stateid); - if (copy) { + spin_lock(&clp->async_lock); + copy = find_async_copy_locked(clp, &os->stateid); + if (copy) os->count = copy->cp_res.wr_bytes_written; - nfs4_put_copy(copy); - } else + else status = nfserr_bad_stateid; + spin_unlock(&clp->async_lock); return status; } @@ -3619,12 +3616,13 @@ static const struct svc_procedure nfsd_procedures4[2] = { }, }; -static unsigned int nfsd_count3[ARRAY_SIZE(nfsd_procedures4)]; +static DEFINE_PER_CPU_ALIGNED(unsigned long, + nfsd_count4[ARRAY_SIZE(nfsd_procedures4)]); const struct svc_version nfsd_version4 = { .vs_vers = 4, - .vs_nproc = 2, + .vs_nproc = ARRAY_SIZE(nfsd_procedures4), .vs_proc = nfsd_procedures4, - .vs_count = nfsd_count3, + .vs_count = nfsd_count4, .vs_dispatch = nfsd_dispatch, .vs_xdrsize = NFS4_SVC_XDRSIZE, .vs_rpcb_optnl = true, diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 78b8cd9651d5..3509e73abe1f 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -233,7 +233,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) * as well be forgiving and just succeed silently. 
*/ goto out_put; - status = vfs_mkdir(&init_user_ns, d_inode(dir), dentry, S_IRWXU); + status = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, S_IRWXU); out_put: dput(dentry); out_unlock: @@ -353,7 +353,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn) status = -ENOENT; if (d_really_is_negative(dentry)) goto out; - status = vfs_rmdir(&init_user_ns, d_inode(dir), dentry); + status = vfs_rmdir(&nop_mnt_idmap, d_inode(dir), dentry); out: dput(dentry); out_unlock: @@ -443,7 +443,7 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) if (nfs4_has_reclaimed_state(name, nn)) goto out_free; - status = vfs_rmdir(&init_user_ns, d_inode(parent), child); + status = vfs_rmdir(&nop_mnt_idmap, d_inode(parent), child); if (status) printk("failed to remove client recovery directory %pd\n", child); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 4ef529379065..6e61fa3acaf1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -600,23 +600,15 @@ put_nfs4_file(struct nfs4_file *fi) } static struct nfsd_file * -__nfs4_get_fd(struct nfs4_file *f, int oflag) -{ - if (f->fi_fds[oflag]) - return nfsd_file_get(f->fi_fds[oflag]); - return NULL; -} - -static struct nfsd_file * find_writeable_file_locked(struct nfs4_file *f) { struct nfsd_file *ret; lockdep_assert_held(&f->fi_lock); - ret = __nfs4_get_fd(f, O_WRONLY); + ret = nfsd_file_get(f->fi_fds[O_WRONLY]); if (!ret) - ret = __nfs4_get_fd(f, O_RDWR); + ret = nfsd_file_get(f->fi_fds[O_RDWR]); return ret; } @@ -639,9 +631,9 @@ find_readable_file_locked(struct nfs4_file *f) lockdep_assert_held(&f->fi_lock); - ret = __nfs4_get_fd(f, O_RDONLY); + ret = nfsd_file_get(f->fi_fds[O_RDONLY]); if (!ret) - ret = __nfs4_get_fd(f, O_RDWR); + ret = nfsd_file_get(f->fi_fds[O_RDWR]); return ret; } @@ -665,11 +657,11 @@ find_any_file(struct nfs4_file *f) if (!f) return NULL; spin_lock(&f->fi_lock); - ret = __nfs4_get_fd(f, O_RDWR); + ret = nfsd_file_get(f->fi_fds[O_RDWR]); if (!ret) { - ret = __nfs4_get_fd(f, O_WRONLY); + ret = nfsd_file_get(f->fi_fds[O_WRONLY]); if (!ret) - ret = __nfs4_get_fd(f, O_RDONLY); + ret = nfsd_file_get(f->fi_fds[O_RDONLY]); } spin_unlock(&f->fi_lock); return ret; @@ -688,15 +680,6 @@ static struct nfsd_file *find_any_file_locked(struct nfs4_file *f) return NULL; } -static struct nfsd_file *find_deleg_file_locked(struct nfs4_file *f) -{ - lockdep_assert_held(&f->fi_lock); - - if (f->fi_deleg_file) - return f->fi_deleg_file; - return NULL; -} - static atomic_long_t num_delegations; unsigned long max_delegations; @@ -992,7 +975,6 @@ static int nfs4_init_cp_state(struct nfsd_net *nn, copy_stateid_t *stid, stid->cs_stid.si_opaque.so_clid.cl_boot = (u32)nn->boot_time; stid->cs_stid.si_opaque.so_clid.cl_id = nn->s2s_cp_cl_id; - stid->cs_type = cs_type; idr_preload(GFP_KERNEL); spin_lock(&nn->s2s_cp_lock); @@ -1003,6 +985,7 @@ static int nfs4_init_cp_state(struct nfsd_net *nn, copy_stateid_t *stid, idr_preload_end(); if (new_id < 0) return 0; + stid->cs_type = cs_type; return 1; } @@ -1036,7 +1019,8 @@ void nfs4_free_copy_state(struct nfsd4_copy *copy) { struct nfsd_net *nn; - WARN_ON_ONCE(copy->cp_stateid.cs_type != NFS4_COPY_STID); + if (copy->cp_stateid.cs_type != NFS4_COPY_STID) + return; nn = net_generic(copy->cp_clp->net, nfsd_net_id); spin_lock(&nn->s2s_cp_lock); idr_remove(&nn->s2s_cp_stateids, @@ -2705,7 +2689,7 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) ds = delegstateid(st); nf = st->sc_file; spin_lock(&nf->fi_lock); - file = find_deleg_file_locked(nf); + 
file = nf->fi_deleg_file; if (!file) goto out; @@ -5298,16 +5282,17 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, /* test and set deny mode */ spin_lock(&fp->fi_lock); status = nfs4_file_check_deny(fp, open->op_share_deny); - if (status == nfs_ok) { - if (status != nfserr_share_denied) { - set_deny(open->op_share_deny, stp); - fp->fi_share_deny |= - (open->op_share_deny & NFS4_SHARE_DENY_BOTH); - } else { - if (nfs4_resolve_deny_conflicts_locked(fp, false, - stp, open->op_share_deny, false)) - status = nfserr_jukebox; - } + switch (status) { + case nfs_ok: + set_deny(open->op_share_deny, stp); + fp->fi_share_deny |= + (open->op_share_deny & NFS4_SHARE_DENY_BOTH); + break; + case nfserr_share_denied: + if (nfs4_resolve_deny_conflicts_locked(fp, false, + stp, open->op_share_deny, false)) + status = nfserr_jukebox; + break; } spin_unlock(&fp->fi_lock); @@ -5356,7 +5341,7 @@ static int nfsd4_check_conflicting_opens(struct nfs4_client *clp, { struct nfs4_ol_stateid *st; struct file *f = fp->fi_deleg_file->nf_file; - struct inode *ino = locks_inode(f); + struct inode *ino = file_inode(f); int writes; writes = atomic_read(&ino->i_writecount); @@ -5438,6 +5423,23 @@ nfsd4_verify_deleg_dentry(struct nfsd4_open *open, struct nfs4_file *fp, return 0; } +/* + * We avoid breaking delegations held by a client due to its own activity, but + * clearing setuid/setgid bits on a write is an implicit activity and the client + * may not notice and continue using the old mode. Avoid giving out a delegation + * on setuid/setgid files when the client is requesting an open for write. + */ +static int +nfsd4_verify_setuid_write(struct nfsd4_open *open, struct nfsd_file *nf) +{ + struct inode *inode = file_inode(nf->nf_file); + + if ((open->op_share_access & NFS4_SHARE_ACCESS_WRITE) && + (inode->i_mode & (S_ISUID|S_ISGID))) + return -EAGAIN; + return 0; +} + static struct nfs4_delegation * nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, struct svc_fh *parent) @@ -5471,6 +5473,8 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, spin_lock(&fp->fi_lock); if (nfs4_delegation_exists(clp, fp)) status = -EAGAIN; + else if (nfsd4_verify_setuid_write(open, nf)) + status = -EAGAIN; else if (!fp->fi_deleg_file) { fp->fi_deleg_file = nf; /* increment early to prevent fi_deleg_file from being @@ -5511,6 +5515,14 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, if (status) goto out_unlock; + /* + * Now that the deleg is set, check again to ensure that nothing + * raced in and changed the mode while we weren't looking.
+ */ + status = nfsd4_verify_setuid_write(open, fp->fi_deleg_file); + if (status) + goto out_unlock; + spin_lock(&state_lock); spin_lock(&fp->fi_lock); if (fp->fi_had_conflict) @@ -6406,23 +6418,26 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, static struct nfsd_file * nfs4_find_file(struct nfs4_stid *s, int flags) { + struct nfsd_file *ret = NULL; + if (!s) return NULL; switch (s->sc_type) { case NFS4_DELEG_STID: - if (WARN_ON_ONCE(!s->sc_file->fi_deleg_file)) - return NULL; - return nfsd_file_get(s->sc_file->fi_deleg_file); + spin_lock(&s->sc_file->fi_lock); + ret = nfsd_file_get(s->sc_file->fi_deleg_file); + spin_unlock(&s->sc_file->fi_lock); + break; case NFS4_OPEN_STID: case NFS4_LOCK_STID: if (flags & RD_STATE) - return find_readable_file(s->sc_file); + ret = find_readable_file(s->sc_file); else - return find_writeable_file(s->sc_file); + ret = find_writeable_file(s->sc_file); } - return NULL; + return ret; } static __be32 @@ -6547,8 +6562,19 @@ void nfs4_put_cpntf_state(struct nfsd_net *nn, struct nfs4_cpntf_state *cps) spin_unlock(&nn->s2s_cp_lock); } -/* - * Checks for stateid operations +/** + * nfs4_preprocess_stateid_op - find and prep stateid for an operation + * @rqstp: incoming request from client + * @cstate: current compound state + * @fhp: filehandle associated with requested stateid + * @stateid: stateid (provided by client) + * @flags: flags describing type of operation to be done + * @nfp: optional nfsd_file return pointer (may be NULL) + * @cstid: optional returned nfs4_stid pointer (may be NULL) + * + * Given info from the client, look up an nfs4_stid for the operation. On + * success, it returns a reference to the nfs4_stid and/or the nfsd_file + * associated with it. */ __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, @@ -6737,8 +6763,18 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ return status; } -/* - * Checks for sequence id mutating operations. +/** + * nfs4_preprocess_seqid_op - find and prep an ol_stateid for a seqid-morphing op + * @cstate: compound state + * @seqid: seqid (provided by client) + * @stateid: stateid (provided by client) + * @typemask: mask of allowable types for this operation + * @stpp: return pointer for the stateid found + * @nn: net namespace for request + * + * Given a stateid+seqid from a client, look up an nfs4_ol_stateid and + * return it in @stpp. On an nfs_ok return, the returned stateid will + * have its st_mutex locked.
*/ static __be32 nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, @@ -7809,7 +7845,7 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) return status; } - inode = locks_inode(nf->nf_file); + inode = file_inode(nf->nf_file); flctx = locks_inode_context(inode); if (flctx && !list_empty_careful(&flctx->flc_posix)) { @@ -8182,7 +8218,6 @@ nfs4_state_shutdown_net(struct net *net) nfsd4_client_tracking_exit(net); nfs4_state_destroy_net(net); - rhltable_destroy(&nfs4_file_rhltable); #ifdef CONFIG_NFSD_V4_2_INTER_SSC nfsd4_ssc_shutdown_umount(nn); #endif @@ -8192,6 +8227,7 @@ void nfs4_state_shutdown(void) { nfsd4_destroy_callback_queue(); + rhltable_destroy(&nfs4_file_rhltable); } static void diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 97edb32be77f..e12e5a4ad502 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2965,7 +2965,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, goto out; } - err = vfs_getattr(&path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); + err = vfs_getattr(&path, &stat, + STATX_BASIC_STATS | STATX_BTIME | STATX_CHANGE_COOKIE, + AT_STATX_SYNC_AS_STAT); if (err) goto out_nfserr; if (!(stat.result_mask & STATX_BTIME)) diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 3e64a3d50a1c..041faa13b852 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -488,7 +488,7 @@ found_entry: case RC_NOCACHE: break; case RC_REPLSTAT: - svc_putu32(&rqstp->rq_res.head[0], rp->c_replstat); + xdr_stream_encode_be32(&rqstp->rq_res_stream, rp->c_replstat); rtn = RC_REPLY; break; case RC_REPLBUFF: @@ -509,7 +509,7 @@ out_trace: * nfsd_cache_update - Update an entry in the duplicate reply cache. * @rqstp: svc_rqst with a finished Reply * @cachetype: which cache to update - * @statp: Reply's status code + * @statp: pointer to Reply's NFS status code, or NULL * * This is called from nfsd_dispatch when the procedure has been * executed and the complete reply is in rqstp->rq_res. diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index c2577ee7ffb2..7b8f17ee5224 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -14,7 +14,6 @@ #include <linux/lockd/lockd.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/gss_api.h> -#include <linux/sunrpc/gss_krb5_enctypes.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/module.h> #include <linux/fsnotify.h> @@ -47,7 +46,6 @@ enum { NFSD_MaxBlkSize, NFSD_MaxConnections, NFSD_Filecache, - NFSD_SupportedEnctypes, /* * The below MUST come last. 
Otherwise we leave a hole in nfsd_files[] * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops @@ -187,16 +185,6 @@ static int export_features_show(struct seq_file *m, void *v) DEFINE_SHOW_ATTRIBUTE(export_features); -#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) -static int supported_enctypes_show(struct seq_file *m, void *v) -{ - seq_printf(m, KRB5_SUPPORTED_ENCTYPES); - return 0; -} - -DEFINE_SHOW_ATTRIBUTE(supported_enctypes); -#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ - static const struct file_operations pool_stats_operations = { .open = nfsd_pool_stats_open, .read = seq_read, @@ -1150,6 +1138,9 @@ static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode) inode->i_op = &simple_dir_inode_operations; inc_nlink(inode); break; + case S_IFLNK: + inode->i_op = &simple_symlink_inode_operations; + break; default: break; } @@ -1195,6 +1186,54 @@ out_err: goto out; } +#if IS_ENABLED(CONFIG_SUNRPC_GSS) +static int __nfsd_symlink(struct inode *dir, struct dentry *dentry, + umode_t mode, const char *content) +{ + struct inode *inode; + + inode = nfsd_get_inode(dir->i_sb, mode); + if (!inode) + return -ENOMEM; + + inode->i_link = (char *)content; + inode->i_size = strlen(content); + + d_add(dentry, inode); + inc_nlink(dir); + fsnotify_create(dir, dentry); + return 0; +} + +/* + * @content is assumed to be a NUL-terminated string that lives + * longer than the symlink itself. + */ +static void nfsd_symlink(struct dentry *parent, const char *name, + const char *content) +{ + struct inode *dir = parent->d_inode; + struct dentry *dentry; + int ret; + + inode_lock(dir); + dentry = d_alloc_name(parent, name); + if (!dentry) + goto out; + ret = __nfsd_symlink(d_inode(parent), dentry, S_IFLNK | 0777, content); + if (ret) + dput(dentry); +out: + inode_unlock(dir); +} +#else +static inline void nfsd_symlink(struct dentry *parent, const char *name, + const char *content) +{ +} + +#endif + static void clear_ncl(struct inode *inode) { struct nfsdfs_client *ncl = inode->i_private; @@ -1355,10 +1394,6 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc) [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_Filecache] = {"filecache", &nfsd_file_cache_stats_fops, S_IRUGO}, -#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) - [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", - &supported_enctypes_fops, S_IRUGO}, -#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, @@ -1371,6 +1406,8 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc) ret = simple_fill_super(sb, 0x6e667364, nfsd_files); if (ret) return ret; + nfsd_symlink(sb->s_root, "supported_krb5_enctypes", + "/proc/net/rpc/gss_krb5_enctypes"); dentry = nfsd_mkdir(sb->s_root, NULL, "clients"); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -1458,16 +1495,11 @@ static __net_init int nfsd_init_net(struct net *net) nn->nfsd_versions = NULL; nn->nfsd4_minorversions = NULL; nfsd4_init_leases_net(nn); - retval = nfsd_reply_cache_init(nn); - if (retval) - goto out_cache_error; get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key)); seqlock_init(&nn->writeverf_lock); return 0; -out_cache_error: - nfsd_idmap_shutdown(net); out_idmap_error: 
nfsd_export_shutdown(net); out_export_error: @@ -1476,9 +1508,6 @@ out_export_error: static __net_exit void nfsd_exit_net(struct net *net) { - struct nfsd_net *nn = net_generic(net, nfsd_net_id); - - nfsd_reply_cache_shutdown(nn); nfsd_idmap_shutdown(net); nfsd_export_shutdown(net); nfsd_netns_free_versions(net_generic(net, nfsd_net_id)); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index fa0144a74267..d88498f8b275 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -86,7 +86,7 @@ bool nfssvc_encode_voidres(struct svc_rqst *rqstp, * Function prototypes. */ int nfsd_svc(int nrservs, struct net *net, const struct cred *cred); -int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp); +int nfsd_dispatch(struct svc_rqst *rqstp); int nfsd_nrthreads(struct net *); int nfsd_nrpools(struct net *); diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 8c52b6c9d31a..ccd8485fee04 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -40,7 +40,7 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry) /* make sure parents give x permission to user */ int err; parent = dget_parent(tdentry); - err = inode_permission(&init_user_ns, + err = inode_permission(&nop_mnt_idmap, d_inode(parent), MAY_EXEC); if (err < 0) { dput(parent); @@ -628,6 +628,10 @@ void fh_fill_pre_attrs(struct svc_fh *fhp) stat.mtime = inode->i_mtime; stat.ctime = inode->i_ctime; stat.size = inode->i_size; + if (v4 && IS_I_VERSION(inode)) { + stat.change_cookie = inode_query_iversion(inode); + stat.result_mask |= STATX_CHANGE_COOKIE; + } } if (v4) fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); @@ -659,6 +663,10 @@ void fh_fill_post_attrs(struct svc_fh *fhp) if (err) { fhp->fh_post_saved = false; fhp->fh_post_attr.ctime = inode->i_ctime; + if (v4 && IS_I_VERSION(inode)) { + fhp->fh_post_attr.change_cookie = inode_query_iversion(inode); + fhp->fh_post_attr.result_mask |= STATX_CHANGE_COOKIE; + } } else fhp->fh_post_saved = true; if (v4) @@ -748,3 +756,37 @@ enum fsid_source fsid_source(const struct svc_fh *fhp) return FSIDSOURCE_UUID; return FSIDSOURCE_DEV; } + +/* + * We could use i_version alone as the change attribute. However, i_version + * can go backwards on a regular file after an unclean shutdown. On its own + * that doesn't necessarily cause a problem, but if i_version goes backwards + * and then is incremented again it could reuse a value that was previously + * used before boot, and a client who queried the two values might incorrectly + * assume nothing changed. + * + * By using both ctime and the i_version counter we guarantee that as long as + * time doesn't go backwards we never reuse an old value. If the filesystem + * advertises STATX_ATTR_CHANGE_MONOTONIC, then this mitigation is not + * needed. + * + * In addition, we only need to do this for regular files. For directories, we + * assume that the new change attr is always logged to stable storage in some + * fashion before the results can be seen.
+ */ +u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode) +{ + u64 chattr; + + if (stat->result_mask & STATX_CHANGE_COOKIE) { + chattr = stat->change_cookie; + if (S_ISREG(inode->i_mode) && + !(stat->attributes & STATX_ATTR_CHANGE_MONOTONIC)) { + chattr += (u64)stat->ctime.tv_sec << 30; + chattr += stat->ctime.tv_nsec; + } + } else { + chattr = time_to_chattr(&stat->ctime); + } + return chattr; +} diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 513e028b0bbe..4e0ecf0ae2cf 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -293,34 +293,7 @@ static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp) fhp->fh_pre_saved = false; } -/* - * We could use i_version alone as the change attribute. However, - * i_version can go backwards after a reboot. On its own that doesn't - * necessarily cause a problem, but if i_version goes backwards and then - * is incremented again it could reuse a value that was previously used - * before boot, and a client who queried the two values might - * incorrectly assume nothing changed. - * - * By using both ctime and the i_version counter we guarantee that as - * long as time doesn't go backwards we never reuse an old value. - */ -static inline u64 nfsd4_change_attribute(struct kstat *stat, - struct inode *inode) -{ - if (inode->i_sb->s_export_op->fetch_iversion) - return inode->i_sb->s_export_op->fetch_iversion(inode); - else if (IS_I_VERSION(inode)) { - u64 chattr; - - chattr = stat->ctime.tv_sec; - chattr <<= 30; - chattr += stat->ctime.tv_nsec; - chattr += inode_query_iversion(inode); - return chattr; - } else - return time_to_chattr(&stat->ctime); -} - +u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode); extern void fh_fill_pre_attrs(struct svc_fh *fhp); extern void fh_fill_post_attrs(struct svc_fh *fhp); extern void fh_fill_both_attrs(struct svc_fh *fhp); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 9744443c3965..c37195572fd0 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -93,7 +93,7 @@ nfsd_proc_setattr(struct svc_rqst *rqstp) if (delta < 0) delta = -delta; if (delta < MAX_TOUCH_TIME_ERROR && - setattr_prepare(&init_user_ns, fhp->fh_dentry, iap) != 0) { + setattr_prepare(&nop_mnt_idmap, fhp->fh_dentry, iap) != 0) { /* * Turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME. 
* This will cause notify_change to set these times @@ -838,11 +838,11 @@ static const struct svc_procedure nfsd_procedures2[18] = { }, }; - -static unsigned int nfsd_count2[ARRAY_SIZE(nfsd_procedures2)]; +static DEFINE_PER_CPU_ALIGNED(unsigned long, + nfsd_count2[ARRAY_SIZE(nfsd_procedures2)]); const struct svc_version nfsd_version2 = { .vs_vers = 2, - .vs_nproc = 18, + .vs_nproc = ARRAY_SIZE(nfsd_procedures2), .vs_proc = nfsd_procedures2, .vs_count = nfsd_count2, .vs_dispatch = nfsd_dispatch, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 325d3d3f1211..9c7b1ef5be40 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -363,7 +363,7 @@ void nfsd_copy_write_verifier(__be32 verf[2], struct nfsd_net *nn) do { read_seqbegin_or_lock(&nn->writeverf_lock, &seq); - memcpy(verf, nn->writeverf, sizeof(*verf)); + memcpy(verf, nn->writeverf, sizeof(nn->writeverf)); } while (need_seqretry(&nn->writeverf_lock, seq)); done_seqretry(&nn->writeverf_lock, seq); } @@ -427,16 +427,23 @@ static int nfsd_startup_net(struct net *net, const struct cred *cred) ret = nfsd_file_cache_start_net(net); if (ret) goto out_lockd; - ret = nfs4_state_start_net(net); + + ret = nfsd_reply_cache_init(nn); if (ret) goto out_filecache; + ret = nfs4_state_start_net(net); + if (ret) + goto out_reply_cache; + #ifdef CONFIG_NFSD_V4_2_INTER_SSC nfsd4_ssc_init_umount_work(nn); #endif nn->nfsd_net_up = true; return 0; +out_reply_cache: + nfsd_reply_cache_shutdown(nn); out_filecache: nfsd_file_cache_shutdown_net(net); out_lockd: @@ -454,6 +461,7 @@ static void nfsd_shutdown_net(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); nfs4_state_shutdown_net(net); + nfsd_reply_cache_shutdown(nn); nfsd_file_cache_shutdown_net(net); if (nn->lockd_up) { lockd_down(net); @@ -1022,7 +1030,6 @@ out: /** * nfsd_dispatch - Process an NFS or NFSACL Request * @rqstp: incoming request - * @statp: pointer to location of accept_stat field in RPC Reply buffer * * This RPC dispatcher integrates the NFS server's duplicate reply cache. 
* @@ -1030,9 +1037,10 @@ out: * %0: Processing complete; do not send a Reply * %1: Processing complete; send Reply in rqstp->rq_res */ -int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) +int nfsd_dispatch(struct svc_rqst *rqstp) { const struct svc_procedure *proc = rqstp->rq_procinfo; + __be32 *statp = rqstp->rq_accept_statp; /* * Give the xdr decoder a chance to change this if it wants @@ -1040,7 +1048,6 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) */ rqstp->rq_cachetype = proc->pc_cachetype; - svcxdr_init_decode(rqstp); if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream)) goto out_decode_err; @@ -1053,12 +1060,6 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) goto out_dropit; } - /* - * Need to grab the location to store the status, as - * NFSv4 does some encoding while processing - */ - svcxdr_init_encode(rqstp); - *statp = proc->pc_func(rqstp); if (test_bit(RQ_DROPME, &rqstp->rq_flags)) goto out_update_drop; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index e94634d30591..d49d3060ed4f 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -705,8 +705,6 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn); void put_nfs4_file(struct nfs4_file *fi); -extern struct nfsd4_copy * -find_async_copy(struct nfs4_client *clp, stateid_t *staetid); extern void nfs4_put_cpntf_state(struct nfsd_net *nn, struct nfs4_cpntf_state *cps); extern __be32 manage_cpntf_state(struct nfsd_net *nn, stateid_t *st, diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 8f9c82d9e075..4183819ea082 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -1202,37 +1202,6 @@ TRACE_EVENT(nfsd_file_close, ) ); -TRACE_EVENT(nfsd_file_fsync, - TP_PROTO( - const struct nfsd_file *nf, - int ret - ), - TP_ARGS(nf, ret), - TP_STRUCT__entry( - __field(void *, nf_inode) - __field(int, nf_ref) - __field(int, ret) - __field(unsigned long, nf_flags) - __field(unsigned char, nf_may) - __field(struct file *, nf_file) - ), - TP_fast_assign( - __entry->nf_inode = nf->nf_inode; - __entry->nf_ref = refcount_read(&nf->nf_ref); - __entry->ret = ret; - __entry->nf_flags = nf->nf_flags; - __entry->nf_may = nf->nf_may; - __entry->nf_file = nf->nf_file; - ), - TP_printk("inode=%p ref=%d flags=%s may=%s nf_file=%p ret=%d", - __entry->nf_inode, - __entry->nf_ref, - show_nf_flags(__entry->nf_flags), - show_nfsd_may_flags(__entry->nf_may), - __entry->nf_file, __entry->ret - ) -); - #include "cache.h" TRACE_DEFINE_ENUM(RC_DROPIT); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 4c3a0d84043c..e7462b5e5f1e 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -126,9 +126,13 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, struct dentry *dentry = *dpp; struct path path = {.mnt = mntget(exp->ex_path.mnt), .dentry = dget(dentry)}; + unsigned int follow_flags = 0; int err = 0; - err = follow_down(&path); + if (exp->ex_flags & NFSEXP_CROSSMOUNT) + follow_flags = LOOKUP_AUTOMOUNT; + + err = follow_down(&path, follow_flags); if (err < 0) goto out; if (path.mnt == exp->ex_path.mnt && path.dentry == dentry && @@ -223,7 +227,7 @@ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp) return 1; if (nfsd4_is_junction(dentry)) return 1; - if (d_mountpoint(dentry)) + if (d_managed(dentry)) /* * Might only be a mountpoint in a different namespace, * but we need to check. 
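The hunks above and below are part of the tree-wide switch from struct user_namespace *mnt_userns parameters to struct mnt_idmap *idmap, with &nop_mnt_idmap replacing &init_user_ns at call sites that act in the initial idmapping. A minimal sketch of what a converted inode operation looks like, assuming the post-conversion VFS helper signatures (foofs is a hypothetical filesystem, not part of this series):

#include <linux/fs.h>
#include <linux/mnt_idmap.h>

/*
 * Sketch only: a ->setattr implementation after the mnt_idmap
 * conversion. An ordinary filesystem simply forwards the idmap it
 * receives from the VFS; internal callers such as nfsd, which always
 * act in the initial idmapping, pass &nop_mnt_idmap explicitly, as
 * the surrounding hunks do.
 */
static int foofs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			 struct iattr *iattr)
{
	struct inode *inode = d_inode(dentry);
	int err;

	err = setattr_prepare(idmap, dentry, iattr);
	if (err)
		return err;

	setattr_copy(idmap, inode, iattr);
	mark_inode_dirty(inode);
	return 0;
}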
@@ -426,7 +430,7 @@ static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap) if (iap->ia_size < 0) return -EFBIG; - host_err = notify_change(&init_user_ns, dentry, &size_attr, NULL); + host_err = notify_change(&nop_mnt_idmap, dentry, &size_attr, NULL); if (host_err) return host_err; iap->ia_valid &= ~ATTR_SIZE; @@ -444,7 +448,7 @@ static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap) return 0; iap->ia_valid |= ATTR_CTIME; - return notify_change(&init_user_ns, dentry, iap, NULL); + return notify_change(&nop_mnt_idmap, dentry, iap, NULL); } /** @@ -542,12 +546,12 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, attr->na_labelerr = security_inode_setsecctx(dentry, attr->na_seclabel->data, attr->na_seclabel->len); if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && attr->na_pacl) - attr->na_aclerr = set_posix_acl(&init_user_ns, + attr->na_aclerr = set_posix_acl(&nop_mnt_idmap, dentry, ACL_TYPE_ACCESS, attr->na_pacl); if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && !attr->na_aclerr && attr->na_dpacl && S_ISDIR(inode->i_mode)) - attr->na_aclerr = set_posix_acl(&init_user_ns, + attr->na_aclerr = set_posix_acl(&nop_mnt_idmap, dentry, ACL_TYPE_DEFAULT, attr->na_dpacl); inode_unlock(inode); @@ -583,7 +587,7 @@ int nfsd4_is_junction(struct dentry *dentry) return 0; if (!(inode->i_mode & S_ISVTX)) return 0; - if (vfs_getxattr(&init_user_ns, dentry, NFSD_JUNCTION_XATTR_NAME, + if (vfs_getxattr(&nop_mnt_idmap, dentry, NFSD_JUNCTION_XATTR_NAME, NULL, 0) <= 0) return 0; return 1; @@ -1363,12 +1367,13 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, err = 0; switch (type) { case S_IFREG: - host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true); + host_err = vfs_create(&nop_mnt_idmap, dirp, dchild, + iap->ia_mode, true); if (!host_err) nfsd_check_ignore_resizing(iap); break; case S_IFDIR: - host_err = vfs_mkdir(&init_user_ns, dirp, dchild, iap->ia_mode); + host_err = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode); if (!host_err && unlikely(d_unhashed(dchild))) { struct dentry *d; d = lookup_one_len(dchild->d_name.name, @@ -1396,7 +1401,7 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, case S_IFBLK: case S_IFIFO: case S_IFSOCK: - host_err = vfs_mknod(&init_user_ns, dirp, dchild, + host_err = vfs_mknod(&nop_mnt_idmap, dirp, dchild, iap->ia_mode, rdev); break; default: @@ -1557,7 +1562,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out_drop_write; } fh_fill_pre_attrs(fhp); - host_err = vfs_symlink(&init_user_ns, d_inode(dentry), dnew, path); + host_err = vfs_symlink(&nop_mnt_idmap, d_inode(dentry), dnew, path); err = nfserrno(host_err); cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp); if (!err) @@ -1625,7 +1630,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, if (d_really_is_negative(dold)) goto out_dput; fh_fill_pre_attrs(ffhp); - host_err = vfs_link(dold, &init_user_ns, dirp, dnew, NULL); + host_err = vfs_link(dold, &nop_mnt_idmap, dirp, dnew, NULL); fh_fill_post_attrs(ffhp); inode_unlock(dirp); if (!host_err) { @@ -1745,10 +1750,10 @@ retry: goto out_dput_old; } else { struct renamedata rd = { - .old_mnt_userns = &init_user_ns, + .old_mnt_idmap = &nop_mnt_idmap, .old_dir = fdir, .old_dentry = odentry, - .new_mnt_userns = &init_user_ns, + .new_mnt_idmap = &nop_mnt_idmap, .new_dir = tdir, .new_dentry = ndentry, }; @@ -1850,14 +1855,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, nfsd_close_cached_files(rdentry); for (retries = 1;;) { - host_err = vfs_unlink(&init_user_ns, dirp, 
rdentry, NULL); + host_err = vfs_unlink(&nop_mnt_idmap, dirp, rdentry, NULL); if (host_err != -EAGAIN || !retries--) break; if (!nfsd_wait_for_delegreturn(rqstp, rinode)) break; } } else { - host_err = vfs_rmdir(&init_user_ns, dirp, rdentry); + host_err = vfs_rmdir(&nop_mnt_idmap, dirp, rdentry); } fh_fill_post_attrs(fhp); @@ -2129,7 +2134,7 @@ nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, inode_lock_shared(inode); - len = vfs_getxattr(&init_user_ns, dentry, name, NULL, 0); + len = vfs_getxattr(&nop_mnt_idmap, dentry, name, NULL, 0); /* * Zero-length attribute, just return. @@ -2156,7 +2161,7 @@ nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, goto out; } - len = vfs_getxattr(&init_user_ns, dentry, name, buf, len); + len = vfs_getxattr(&nop_mnt_idmap, dentry, name, buf, len); if (len <= 0) { kvfree(buf); buf = NULL; @@ -2267,7 +2272,7 @@ nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name) inode_lock(fhp->fh_dentry->d_inode); fh_fill_pre_attrs(fhp); - ret = __vfs_removexattr_locked(&init_user_ns, fhp->fh_dentry, + ret = __vfs_removexattr_locked(&nop_mnt_idmap, fhp->fh_dentry, name, NULL); fh_fill_post_attrs(fhp); @@ -2294,7 +2299,7 @@ nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, inode_lock(fhp->fh_dentry->d_inode); fh_fill_pre_attrs(fhp); - ret = __vfs_setxattr_locked(&init_user_ns, fhp->fh_dentry, name, buf, + ret = __vfs_setxattr_locked(&nop_mnt_idmap, fhp->fh_dentry, name, buf, len, flags, NULL); fh_fill_post_attrs(fhp); inode_unlock(fhp->fh_dentry->d_inode); @@ -2378,14 +2383,14 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, return 0; /* This assumes NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */ - err = inode_permission(&init_user_ns, inode, + err = inode_permission(&nop_mnt_idmap, inode, acc & (MAY_READ | MAY_WRITE | MAY_EXEC)); /* Allow read access to binaries even when mode 111 */ if (err == -EACCES && S_ISREG(inode->i_mode) && (acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE) || acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC))) - err = inode_permission(&init_user_ns, inode, MAY_EXEC); + err = inode_permission(&nop_mnt_idmap, inode, MAY_EXEC); return err? nfserrno(err) : 0; } diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index dbdfef7ae85b..43fb57a301d3 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -170,9 +170,14 @@ static inline void fh_drop_write(struct svc_fh *fh) static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat) { + u32 request_mask = STATX_BASIC_STATS; struct path p = {.mnt = fh->fh_export->ex_path.mnt, .dentry = fh->fh_dentry}; - return nfserrno(vfs_getattr(&p, stat, STATX_BASIC_STATS, + + if (fh->fh_maxsize == NFS4_FHSIZE) + request_mask |= (STATX_BTIME | STATX_CHANGE_COOKIE); + + return nfserrno(vfs_getattr(&p, stat, request_mask, AT_STATX_SYNC_AS_STAT)); } diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 4fd2cf6d1d2d..510978e602da 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -571,7 +571,7 @@ struct nfsd4_copy { struct task_struct *copy_task; refcount_t refcount; - struct vfsmount *ss_mnt; + struct nfsd4_ssc_umount_item *ss_nsui; struct nfs_fh c_fh; nfs4_stateid stateid; }; diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig index 254d102e79c9..7d59567465e1 100644 --- a/fs/nilfs2/Kconfig +++ b/fs/nilfs2/Kconfig @@ -2,6 +2,7 @@ config NILFS2_FS tristate "NILFS2 file system support" select CRC32 + select LEGACY_DIRECT_IO help NILFS2 is a log-structured file system (LFS) supporting continuous snapshotting. 
In addition to versioning capability of the entire diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index e74fda212620..e956f886a1a1 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -188,7 +188,7 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc, struct page *opage = obh->b_page; lock_page(opage); retry: - /* BUG_ON(oldkey != obh->b_page->index); */ + /* BUG_ON(oldkey != obh->b_folio->index); */ if (unlikely(oldkey != opage->index)) NILFS_PAGE_BUG(opage, "invalid oldkey %lld (newkey=%lld)", diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 40ce92a332fe..2681a449edc1 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -398,7 +398,7 @@ int nilfs_btree_broken_node_block(struct buffer_head *bh) if (buffer_nilfs_checked(bh)) return 0; - inode = bh->b_page->mapping->host; + inode = bh->b_folio->mapping->host; ret = nilfs_btree_node_broken((struct nilfs_btree_node *)bh->b_data, bh->b_size, inode, bh->b_blocknr); if (likely(!ret)) @@ -2150,7 +2150,7 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree, struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode; struct address_space *btcache = btnc_inode->i_mapping; struct list_head lists[NILFS_BTREE_LEVEL_MAX]; - struct pagevec pvec; + struct folio_batch fbatch; struct buffer_head *bh, *head; pgoff_t index = 0; int level, i; @@ -2160,19 +2160,19 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree, level++) INIT_LIST_HEAD(&lists[level]); - pagevec_init(&pvec); + folio_batch_init(&fbatch); - while (pagevec_lookup_tag(&pvec, btcache, &index, - PAGECACHE_TAG_DIRTY)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - bh = head = page_buffers(pvec.pages[i]); + while (filemap_get_folios_tag(btcache, &index, (pgoff_t)-1, + PAGECACHE_TAG_DIRTY, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + bh = head = folio_buffers(fbatch.folios[i]); do { if (buffer_dirty(bh)) nilfs_btree_add_dirty_buffer(btree, lists, bh); } while ((bh = bh->b_this_page) != head); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 9930fa901039..9cf6ba58f585 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -40,8 +40,21 @@ static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat) static int nilfs_dat_prepare_entry(struct inode *dat, struct nilfs_palloc_req *req, int create) { - return nilfs_palloc_get_entry_block(dat, req->pr_entry_nr, - create, &req->pr_entry_bh); + int ret; + + ret = nilfs_palloc_get_entry_block(dat, req->pr_entry_nr, + create, &req->pr_entry_bh); + if (unlikely(ret == -ENOENT)) { + nilfs_err(dat->i_sb, + "DAT doesn't have a block to manage vblocknr = %llu", + (unsigned long long)req->pr_entry_nr); + /* + * Return internal code -EINVAL to notify bmap layer of + * metadata corruption. 
+ */ + ret = -EINVAL; + } + return ret; } static void nilfs_dat_commit_entry(struct inode *dat, @@ -123,11 +136,7 @@ static void nilfs_dat_commit_free(struct inode *dat, int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req) { - int ret; - - ret = nilfs_dat_prepare_entry(dat, req, 0); - WARN_ON(ret == -ENOENT); - return ret; + return nilfs_dat_prepare_entry(dat, req, 0); } void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req, @@ -149,19 +158,19 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req, int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) { struct nilfs_dat_entry *entry; + __u64 start; sector_t blocknr; void *kaddr; int ret; ret = nilfs_dat_prepare_entry(dat, req, 0); - if (ret < 0) { - WARN_ON(ret == -ENOENT); + if (ret < 0) return ret; - } kaddr = kmap_atomic(req->pr_entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, req->pr_entry_bh, kaddr); + start = le64_to_cpu(entry->de_start); blocknr = le64_to_cpu(entry->de_blocknr); kunmap_atomic(kaddr); @@ -172,6 +181,15 @@ int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) return ret; } } + if (unlikely(start > nilfs_mdt_cno(dat))) { + nilfs_err(dat->i_sb, + "vblocknr = %llu has abnormal lifetime: start cno (= %llu) > current cno (= %llu)", + (unsigned long long)req->pr_entry_nr, + (unsigned long long)start, + (unsigned long long)nilfs_mdt_cno(dat)); + nilfs_dat_abort_entry(dat, req); + return -EINVAL; + } return 0; } diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index b0d22ff24b67..48fe71d309cb 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -140,7 +140,7 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh) { wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - struct inode *inode = bh->b_page->mapping->host; + struct inode *inode = bh->b_folio->mapping->host; nilfs_err(inode->i_sb, "I/O error reading %s block for GC (ino=%lu, vblocknr=%llu)", diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 232dd7b6cca1..1310d2d5feb3 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -364,7 +364,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) ii->i_bh = bh; atomic64_inc(&root->inodes_count); - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_ino = ino; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); @@ -949,7 +949,7 @@ void nilfs_evict_inode(struct inode *inode) */ } -int nilfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int nilfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct nilfs_transaction_info ti; @@ -957,7 +957,7 @@ int nilfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct super_block *sb = inode->i_sb; int err; - err = setattr_prepare(&init_user_ns, dentry, iattr); + err = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (err) return err; @@ -972,7 +972,7 @@ int nilfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, nilfs_truncate(inode); } - setattr_copy(&init_user_ns, inode, iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) { @@ -988,7 +988,7 @@ out_err: return err; } -int nilfs_permission(struct user_namespace *mnt_userns, struct inode *inode, +int nilfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct nilfs_root *root = NILFS_I(inode)->i_root; @@ 
-997,7 +997,7 @@ int nilfs_permission(struct user_namespace *mnt_userns, struct inode *inode, root->cno != NILFS_CPTREE_CURRENT_CNO) return -EROFS; /* snapshot is not writable */ - return generic_permission(&init_user_ns, inode, mask); + return generic_permission(&nop_mnt_idmap, inode, mask); } int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh) diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 87e1004b606d..5ccc638ae92f 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -128,7 +128,7 @@ int nilfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) /** * nilfs_fileattr_set - ioctl to support chattr */ -int nilfs_fileattr_set(struct user_namespace *mnt_userns, +int nilfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); @@ -1114,7 +1114,14 @@ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp) minseg = range[0] + segbytes - 1; do_div(minseg, segbytes); + + if (range[1] < 4096) + goto out; + maxseg = NILFS_SB2_OFFSET_BYTES(range[1]); + if (maxseg < segbytes) + goto out; + do_div(maxseg, segbytes); maxseg--; diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index cbf4fa60eea2..19c8158605ed 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -563,7 +563,7 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh) struct page *page; int blkbits = inode->i_blkbits; - page = grab_cache_page(shadow->inode->i_mapping, bh->b_page->index); + page = grab_cache_page(shadow->inode->i_mapping, bh->b_folio->index); if (!page) return -ENOMEM; @@ -595,7 +595,7 @@ nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh) struct page *page; int n; - page = find_lock_page(shadow->inode->i_mapping, bh->b_page->index); + page = find_lock_page(shadow->inode->i_mapping, bh->b_folio->index); if (page) { if (page_has_buffers(page)) { n = bh_offset(bh) >> inode->i_blkbits; diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 23899e0ae850..c7024da8f1e2 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -72,7 +72,7 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) * If the create succeeds, we fill in the inode information * with d_instantiate(). 
*/ -static int nilfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int nilfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; @@ -100,7 +100,7 @@ static int nilfs_create(struct user_namespace *mnt_userns, struct inode *dir, } static int -nilfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +nilfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; @@ -125,7 +125,7 @@ nilfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, return err; } -static int nilfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int nilfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct nilfs_transaction_info ti; @@ -202,7 +202,7 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir, return err; } -static int nilfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int nilfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; @@ -340,7 +340,7 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry) return err; } -static int nilfs_rename(struct user_namespace *mnt_userns, +static int nilfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index aecda4fc95f5..8046490cd7fe 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -242,7 +242,7 @@ extern int nilfs_sync_file(struct file *, loff_t, loff_t, int); /* ioctl.c */ int nilfs_fileattr_get(struct dentry *dentry, struct fileattr *m); -int nilfs_fileattr_set(struct user_namespace *mnt_userns, +int nilfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); long nilfs_ioctl(struct file *, unsigned int, unsigned long); long nilfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -271,10 +271,10 @@ struct inode *nilfs_iget_for_shadow(struct inode *inode); extern void nilfs_update_inode(struct inode *, struct buffer_head *, int); extern void nilfs_truncate(struct inode *); extern void nilfs_evict_inode(struct inode *); -extern int nilfs_setattr(struct user_namespace *, struct dentry *, +extern int nilfs_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern void nilfs_write_failed(struct address_space *mapping, loff_t to); -int nilfs_permission(struct user_namespace *mnt_userns, struct inode *inode, +int nilfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh); extern int nilfs_inode_dirty(struct inode *); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 39b7eea2642a..41ccd43cd979 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -240,42 +240,43 @@ static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty) int nilfs_copy_dirty_pages(struct address_space *dmap, struct address_space *smap) { - struct pagevec pvec; + struct folio_batch fbatch; unsigned int i; pgoff_t index = 0; int err = 0; - pagevec_init(&pvec); + folio_batch_init(&fbatch); repeat: - if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY)) + if (!filemap_get_folios_tag(smap, &index, (pgoff_t)-1, + PAGECACHE_TAG_DIRTY, &fbatch)) return 0; - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page 
*page = pvec.pages[i], *dpage; + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i], *dfolio; - lock_page(page); - if (unlikely(!PageDirty(page))) - NILFS_PAGE_BUG(page, "inconsistent dirty state"); + folio_lock(folio); + if (unlikely(!folio_test_dirty(folio))) + NILFS_PAGE_BUG(&folio->page, "inconsistent dirty state"); - dpage = grab_cache_page(dmap, page->index); - if (unlikely(!dpage)) { + dfolio = filemap_grab_folio(dmap, folio->index); + if (unlikely(!dfolio)) { /* No empty page is added to the page cache */ err = -ENOMEM; - unlock_page(page); + folio_unlock(folio); break; } - if (unlikely(!page_has_buffers(page))) - NILFS_PAGE_BUG(page, + if (unlikely(!folio_buffers(folio))) + NILFS_PAGE_BUG(&folio->page, "found empty page in dat page cache"); - nilfs_copy_page(dpage, page, 1); - __set_page_dirty_nobuffers(dpage); + nilfs_copy_page(&dfolio->page, &folio->page, 1); + filemap_dirty_folio(folio_mapping(dfolio), dfolio); - unlock_page(dpage); - put_page(dpage); - unlock_page(page); + folio_unlock(dfolio); + folio_put(dfolio); + folio_unlock(folio); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); if (likely(!err)) @@ -357,22 +358,22 @@ repeat: */ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent) { - struct pagevec pvec; + struct folio_batch fbatch; unsigned int i; pgoff_t index = 0; - pagevec_init(&pvec); + folio_batch_init(&fbatch); - while (pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + while (filemap_get_folios_tag(mapping, &index, (pgoff_t)-1, + PAGECACHE_TAG_DIRTY, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; - lock_page(page); - nilfs_clear_dirty_page(page, silent); - unlock_page(page); + folio_lock(folio); + nilfs_clear_dirty_page(&folio->page, silent); + folio_unlock(folio); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 76c3bd88b858..19446a8243d7 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -680,7 +680,7 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, loff_t start, loff_t end) { struct address_space *mapping = inode->i_mapping; - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t index = 0, last = ULONG_MAX; size_t ndirties = 0; int i; @@ -694,23 +694,26 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, index = start >> PAGE_SHIFT; last = end >> PAGE_SHIFT; } - pagevec_init(&pvec); + folio_batch_init(&fbatch); repeat: if (unlikely(index > last) || - !pagevec_lookup_range_tag(&pvec, mapping, &index, last, - PAGECACHE_TAG_DIRTY)) + !filemap_get_folios_tag(mapping, &index, last, + PAGECACHE_TAG_DIRTY, &fbatch)) return ndirties; - for (i = 0; i < pagevec_count(&pvec); i++) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { struct buffer_head *bh, *head; - struct page *page = pvec.pages[i]; + struct folio *folio = fbatch.folios[i]; - lock_page(page); - if (!page_has_buffers(page)) - create_empty_buffers(page, i_blocksize(inode), 0); - unlock_page(page); + folio_lock(folio); + head = folio_buffers(folio); + if (!head) { + create_empty_buffers(&folio->page, i_blocksize(inode), 0); + head = folio_buffers(folio); + } + folio_unlock(folio); - bh = head = page_buffers(page); + bh = head; do { if (!buffer_dirty(bh) || buffer_async_write(bh)) continue; @@ -718,13 +721,13 @@ 
static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, list_add_tail(&bh->b_assoc_buffers, listp); ndirties++; if (unlikely(ndirties >= nlimit)) { - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); return ndirties; } } while (bh = bh->b_this_page, bh != head); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); goto repeat; } @@ -734,20 +737,19 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode, { struct nilfs_inode_info *ii = NILFS_I(inode); struct inode *btnc_inode = ii->i_assoc_inode; - struct pagevec pvec; + struct folio_batch fbatch; struct buffer_head *bh, *head; unsigned int i; pgoff_t index = 0; if (!btnc_inode) return; + folio_batch_init(&fbatch); - pagevec_init(&pvec); - - while (pagevec_lookup_tag(&pvec, btnc_inode->i_mapping, &index, - PAGECACHE_TAG_DIRTY)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - bh = head = page_buffers(pvec.pages[i]); + while (filemap_get_folios_tag(btnc_inode->i_mapping, &index, + (pgoff_t)-1, PAGECACHE_TAG_DIRTY, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + bh = head = folio_buffers(fbatch.folios[i]); do { if (buffer_dirty(bh) && !buffer_async_write(bh)) { @@ -758,7 +760,7 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode, bh = bh->b_this_page; } while (bh != head); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } } @@ -1581,7 +1583,7 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci, nblocks = le32_to_cpu(finfo->fi_nblocks); ndatablk = le32_to_cpu(finfo->fi_ndatablk); - inode = bh->b_page->mapping->host; + inode = bh->b_folio->mapping->host; if (mode == SC_LSEG_DSYNC) sc_op = &nilfs_sc_dsync_ops; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 6edb6e0dd61f..1422b8ba24ed 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -409,6 +409,15 @@ int nilfs_resize_fs(struct super_block *sb, __u64 newsize) goto out; /* + * Prevent underflow in second superblock position calculation. + * The exact minimum size check is done in nilfs_sufile_resize(). + */ + if (newsize < 4096) { + ret = -ENOSPC; + goto out; + } + + /* * Write lock is required to protect some functions depending * on the number of segments, the number of reserved segments, * and so forth. 
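The two nilfs2 guards above (the range[1] < 4096 test in nilfs_ioctl_set_alloc_range() and the newsize < 4096 test in nilfs_resize_fs()) protect the same computation: the second-superblock offset is derived by stepping back one 4096-byte block from the device size, which underflows for sizes below one block. A standalone illustration of the failure mode, with the macro paraphrased from the nilfs2 on-disk header (the shift-by-12 form shown here is an assumption):

#include <stdint.h>
#include <stdio.h>

/*
 * Paraphrased from include/uapi/linux/nilfs2_ondisk.h: the second
 * superblock sits at the start of the last full 4 KiB block.
 */
#define NILFS_SB2_OFFSET_BYTES(devsize) ((((devsize) >> 12) - 1) << 12)

int main(void)
{
	uint64_t sane = 1048576;	/* 1 MiB device */
	uint64_t tiny = 2048;		/* smaller than one 4 KiB block */

	/* 255 * 4096 = 1044480: a valid in-device offset. */
	printf("%llu\n", (unsigned long long)NILFS_SB2_OFFSET_BYTES(sane));

	/*
	 * (2048 >> 12) == 0, so the macro computes (0 - 1) << 12 in u64
	 * arithmetic and wraps to 0xfffffffffffff000, far past the end
	 * of the device; hence the explicit "< 4096" checks above.
	 */
	printf("%llu\n", (unsigned long long)NILFS_SB2_OFFSET_BYTES(tiny));
	return 0;
}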
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 2064e6473d30..3a4c9c150cbf 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -544,9 +544,15 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, { struct nilfs_super_block **sbp = nilfs->ns_sbp; struct buffer_head **sbh = nilfs->ns_sbh; - u64 sb2off = NILFS_SB2_OFFSET_BYTES(bdev_nr_bytes(nilfs->ns_bdev)); + u64 sb2off, devsize = bdev_nr_bytes(nilfs->ns_bdev); int valid[2], swp = 0; + if (devsize < NILFS_SEG_MIN_BLOCKS * NILFS_MIN_BLOCK_SIZE + 4096) { + nilfs_err(sb, "device size too small"); + return -EINVAL; + } + sb2off = NILFS_SB2_OFFSET_BYTES(devsize); + sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize, &sbh[0]); sbp[1] = nilfs_read_super_block(sb, sb2off, blocksize, &sbh[1]); diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig index c020d26ba223..c6c72c90fd25 100644 --- a/fs/notify/Kconfig +++ b/fs/notify/Kconfig @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only config FSNOTIFY def_bool n - select SRCU source "fs/notify/dnotify/Kconfig" source "fs/notify/inotify/Kconfig" diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index a2a15bc4df28..29bdd99b29fa 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -262,7 +262,7 @@ static int fanotify_get_response(struct fsnotify_group *group, } /* userspace responded, convert to something usable */ - switch (event->response & ~FAN_AUDIT) { + switch (event->response & FANOTIFY_RESPONSE_ACCESS) { case FAN_ALLOW: ret = 0; break; @@ -273,7 +273,8 @@ static int fanotify_get_response(struct fsnotify_group *group, /* Check if the response should be audited */ if (event->response & FAN_AUDIT) - audit_fanotify(event->response & ~FAN_AUDIT); + audit_fanotify(event->response & ~FAN_AUDIT, + &event->audit_rule); pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, group, event, ret); @@ -563,6 +564,9 @@ static struct fanotify_event *fanotify_alloc_perm_event(const struct path *path, pevent->fae.type = FANOTIFY_EVENT_TYPE_PATH_PERM; pevent->response = 0; + pevent->hdr.type = FAN_RESPONSE_INFO_NONE; + pevent->hdr.pad = 0; + pevent->hdr.len = 0; pevent->state = FAN_EVENT_INIT; pevent->path = *path; path_get(path); diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 57f51a9a3015..e8a3c28c5d12 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -425,9 +425,13 @@ FANOTIFY_PE(struct fanotify_event *event) struct fanotify_perm_event { struct fanotify_event fae; struct path path; - unsigned short response; /* userspace answer to the event */ + u32 response; /* userspace answer to the event */ unsigned short state; /* state of the event */ int fd; /* fd we passed to userspace for this event */ + union { + struct fanotify_response_info_header hdr; + struct fanotify_response_info_audit_rule audit_rule; + }; }; static inline struct fanotify_perm_event * diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 4546da4a54f9..8f430bfad487 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -283,19 +283,42 @@ static int create_fd(struct fsnotify_group *group, const struct path *path, return client_fd; } +static int process_access_response_info(const char __user *info, + size_t info_len, + struct fanotify_response_info_audit_rule *friar) +{ + if (info_len != sizeof(*friar)) + return -EINVAL; + + if (copy_from_user(friar, info, sizeof(*friar))) + return -EFAULT; 
+ + if (friar->hdr.type != FAN_RESPONSE_INFO_AUDIT_RULE) + return -EINVAL; + if (friar->hdr.pad != 0) + return -EINVAL; + if (friar->hdr.len != sizeof(*friar)) + return -EINVAL; + + return info_len; +} + /* * Finish processing of permission event by setting it to ANSWERED state and * drop group->notification_lock. */ static void finish_permission_event(struct fsnotify_group *group, - struct fanotify_perm_event *event, - unsigned int response) + struct fanotify_perm_event *event, u32 response, + struct fanotify_response_info_audit_rule *friar) __releases(&group->notification_lock) { bool destroy = false; assert_spin_locked(&group->notification_lock); - event->response = response; + event->response = response & ~FAN_INFO; + if (response & FAN_INFO) + memcpy(&event->audit_rule, friar, sizeof(*friar)); + if (event->state == FAN_EVENT_CANCELED) destroy = true; else @@ -306,20 +329,27 @@ static void finish_permission_event(struct fsnotify_group *group, } static int process_access_response(struct fsnotify_group *group, - struct fanotify_response *response_struct) + struct fanotify_response *response_struct, + const char __user *info, + size_t info_len) { struct fanotify_perm_event *event; int fd = response_struct->fd; - int response = response_struct->response; + u32 response = response_struct->response; + int ret = info_len; + struct fanotify_response_info_audit_rule friar; - pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group, - fd, response); + pr_debug("%s: group=%p fd=%d response=%u buf=%p size=%zu\n", __func__, + group, fd, response, info, info_len); /* * make sure the response is valid, if invalid we do nothing and either * userspace can send a valid response or we will clean it up after the * timeout */ - switch (response & ~FAN_AUDIT) { + if (response & ~FANOTIFY_RESPONSE_VALID_MASK) + return -EINVAL; + + switch (response & FANOTIFY_RESPONSE_ACCESS) { case FAN_ALLOW: case FAN_DENY: break; @@ -327,10 +357,20 @@ static int process_access_response(struct fsnotify_group *group, return -EINVAL; } - if (fd < 0) + if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT)) return -EINVAL; - if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT)) + if (response & FAN_INFO) { + ret = process_access_response_info(info, info_len, &friar); + if (ret < 0) + return ret; + if (fd == FAN_NOFD) + return ret; + } else { + ret = 0; + } + + if (fd < 0) return -EINVAL; spin_lock(&group->notification_lock); @@ -340,9 +380,9 @@ static int process_access_response(struct fsnotify_group *group, continue; list_del_init(&event->fae.fse.list); - finish_permission_event(group, event, response); + finish_permission_event(group, event, response, &friar); wake_up(&group->fanotify_data.access_waitq); - return 0; + return ret; } spin_unlock(&group->notification_lock); @@ -804,7 +844,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, if (ret <= 0) { spin_lock(&group->notification_lock); finish_permission_event(group, - FANOTIFY_PERM(event), FAN_DENY); + FANOTIFY_PERM(event), FAN_DENY, NULL); wake_up(&group->fanotify_data.access_waitq); } else { spin_lock(&group->notification_lock); @@ -827,28 +867,32 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { - struct fanotify_response response = { .fd = -1, .response = -1 }; + struct fanotify_response response; struct fsnotify_group *group; int ret; + const char __user *info_buf = buf + sizeof(struct 
fanotify_response); + size_t info_len; if (!IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) return -EINVAL; group = file->private_data; + pr_debug("%s: group=%p count=%zu\n", __func__, group, count); + if (count < sizeof(response)) return -EINVAL; - count = sizeof(response); - - pr_debug("%s: group=%p count=%zu\n", __func__, group, count); - - if (copy_from_user(&response, buf, count)) + if (copy_from_user(&response, buf, sizeof(response))) return -EFAULT; - ret = process_access_response(group, &response); + info_len = count - sizeof(response); + + ret = process_access_response(group, &response, info_buf, info_len); if (ret < 0) count = ret; + else + count = sizeof(response) + ret; return count; } @@ -876,7 +920,7 @@ static int fanotify_release(struct inode *ignored, struct file *file) event = list_first_entry(&group->fanotify_data.access_list, struct fanotify_perm_event, fae.fse.list); list_del_init(&event->fae.fse.list); - finish_permission_event(group, event, FAN_ALLOW); + finish_permission_event(group, event, FAN_ALLOW, NULL); spin_lock(&group->notification_lock); } @@ -893,7 +937,7 @@ static int fanotify_release(struct inode *ignored, struct file *file) fsnotify_destroy_event(group, fsn_event); } else { finish_permission_event(group, FANOTIFY_PERM(event), - FAN_ALLOW); + FAN_ALLOW, NULL); } spin_lock(&group->notification_lock); } diff --git a/fs/nsfs.c b/fs/nsfs.c index 3506f6074288..f8df60b3b901 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -21,6 +21,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, static const struct file_operations ns_file_operations = { .llseek = no_llseek, .unlocked_ioctl = ns_ioctl, + .compat_ioctl = compat_ptr_ioctl, }; static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) @@ -254,7 +255,7 @@ out_invalid: /** * ns_match() - Returns true if current namespace matches dev/ino provided. - * @ns_common: current ns + * @ns: current namespace * @dev: dev_t from nsfs that will be matched against current nsfs * @ino: ino_t from nsfs that will be matched against current nsfs * diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 9364d35b4a10..e8aeba124a95 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * aops.c - NTFS kernel address space operations and page cache handling. * * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. @@ -1646,7 +1646,7 @@ hole: return block; } -/** +/* * ntfs_normal_aops - address space operations for normal inodes and attributes * * Note these are not used for compressed or mst protected inodes and @@ -1664,7 +1664,7 @@ const struct address_space_operations ntfs_normal_aops = { .error_remove_page = generic_error_remove_page, }; -/** +/* * ntfs_compressed_aops - address space operations for compressed inodes */ const struct address_space_operations ntfs_compressed_aops = { @@ -1678,9 +1678,9 @@ const struct address_space_operations ntfs_compressed_aops = { .error_remove_page = generic_error_remove_page, }; -/** +/* * ntfs_mst_aops - general address space operations for mst protecteed inodes - * and attributes + * and attributes */ const struct address_space_operations ntfs_mst_aops = { .read_folio = ntfs_read_folio, /* Fill page with data. */ diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h index 0cac5458c023..8d0958a149cb 100644 --- a/fs/ntfs/aops.h +++ b/fs/ntfs/aops.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/** +/* * aops.h - Defines for NTFS kernel address space operations and page cache * handling. 
Part of the Linux-NTFS project. * diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c index 587e9b187873..f9cb180b6f6b 100644 --- a/fs/ntfs/compress.c +++ b/fs/ntfs/compress.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * compress.c - NTFS kernel compressed attributes handling. * Part of the Linux-NTFS project. * @@ -41,12 +41,12 @@ typedef enum { NTFS_MAX_CB_SIZE = 64 * 1024, } ntfs_compression_constants; -/** +/* * ntfs_compression_buffer - one buffer for the decompression engine */ static u8 *ntfs_compression_buffer; -/** +/* * ntfs_cb_lock - spinlock which protects ntfs_compression_buffer */ static DEFINE_SPINLOCK(ntfs_cb_lock); diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index cd96083a12c8..518c3a21a556 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * dir.c - NTFS kernel directory operations. Part of the Linux-NTFS project. * * Copyright (c) 2001-2007 Anton Altaparmakov @@ -17,7 +17,7 @@ #include "debug.h" #include "ntfs.h" -/** +/* * The little endian Unicode string $I30 as a global constant. */ ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'), diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 08c659332e26..6c3f38d66579 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * inode.c - NTFS kernel inode handling. * * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. @@ -2865,7 +2865,7 @@ void ntfs_truncate_vfs(struct inode *vi) { /** * ntfs_setattr - called from notify_change() when an attribute is being changed - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: dentry whose attributes to change * @attr: structure describing the attributes and the changes * @@ -2878,14 +2878,14 @@ void ntfs_truncate_vfs(struct inode *vi) { * * Called with ->i_mutex held. */ -int ntfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *vi = d_inode(dentry); int err; unsigned int ia_valid = attr->ia_valid; - err = setattr_prepare(&init_user_ns, dentry, attr); + err = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (err) goto out; /* We do not support NTFS ACLs yet. */ @@ -2935,7 +2935,7 @@ out: } /** - * ntfs_write_inode - write out a dirty inode + * __ntfs_write_inode - write out a dirty inode * @vi: inode to write out * @sync: if true, write out synchronously * @@ -3033,7 +3033,7 @@ int __ntfs_write_inode(struct inode *vi, int sync) * might not need to be written out. * NOTE: It is not a problem when the inode for $MFT itself is being * written out as mark_ntfs_record_dirty() will only set I_DIRTY_PAGES - * on the $MFT inode and hence ntfs_write_inode() will not be + * on the $MFT inode and hence __ntfs_write_inode() will not be * re-invoked because of it which in turn is ok since the dirtied mft * record will be cleaned and written out to disk below, i.e. before * this function returns. 
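The change repeated throughout these ntfs hunks, and across the rest of the series, is mechanical: each inode operation that took a struct user_namespace *mnt_userns now takes a struct mnt_idmap *idmap, and filesystems without idmapped-mount support pass &nop_mnt_idmap where they previously passed &init_user_ns. A minimal sketch of the converted ->setattr shape (illustrative only; example_setattr is an invented name, not any one filesystem's implementation):

#include <linux/fs.h>
#include <linux/mnt_idmapping.h>

static int example_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			   struct iattr *attr)
{
	struct inode *inode = d_inode(dentry);
	int err;

	/* Validate the request; permission checks go through the mount's idmap. */
	err = setattr_prepare(idmap, dentry, attr);
	if (err)
		return err;

	/* ... filesystem-specific size/time handling would go here ... */

	/* Copy the validated attributes into the inode, idmap-aware. */
	setattr_copy(idmap, inode, attr);
	mark_inode_dirty(inode);
	return 0;
}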
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h index 6f78ee00f57f..147ef4ddb691 100644 --- a/fs/ntfs/inode.h +++ b/fs/ntfs/inode.h @@ -289,7 +289,7 @@ extern int ntfs_show_options(struct seq_file *sf, struct dentry *root); extern int ntfs_truncate(struct inode *vi); extern void ntfs_truncate_vfs(struct inode *vi); -extern int ntfs_setattr(struct user_namespace *mnt_userns, +extern int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); extern int __ntfs_write_inode(struct inode *vi, int sync); diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index f7bf5ce960cc..48030899dc6e 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project. * * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index 4e6a44bc654c..ab44f2db533b 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -259,7 +259,7 @@ err_out: } } -/** +/* * Inode operations for directories. */ const struct inode_operations ntfs_dir_inode_ops = { @@ -364,7 +364,7 @@ static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid, ntfs_nfs_get_inode); } -/** +/* * Export operations allowing NFS exporting of mounted NTFS partitions. * * We use the default ->encode_fh() for now. Note that they diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c index 97932fb5179c..0d448e9881f7 100644 --- a/fs/ntfs/runlist.c +++ b/fs/ntfs/runlist.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * runlist.c - NTFS runlist handling code. Part of the Linux-NTFS project. * * Copyright (c) 2001-2007 Anton Altaparmakov diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 001f4e053c85..2643a08182e1 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -58,9 +58,17 @@ const option_t on_errors_arr[] = { }; /** - * simple_getbool - + * simple_getbool - convert input string to a boolean value + * @s: input string to convert + * @setval: where to store the output boolean value * * Copied from old ntfs driver (which copied from vfat driver). + * + * "1", "yes", "true", or an empty string are converted to %true. + * "0", "no", and "false" are converted to %false. + * + * Return: %1 if the string is converted or was empty and *setval contains it; + * %0 if the string was not valid. */ static int simple_getbool(char *s, bool *setval) { @@ -2657,7 +2665,7 @@ static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc) } #endif -/** +/* * The complete super operations. */ static const struct super_operations ntfs_sops = { diff --git a/fs/ntfs3/Kconfig b/fs/ntfs3/Kconfig index 6e4cbc48ab8e..96cc236f7f7b 100644 --- a/fs/ntfs3/Kconfig +++ b/fs/ntfs3/Kconfig @@ -2,6 +2,7 @@ config NTFS3_FS tristate "NTFS Read-Write file system support" select NLS + select LEGACY_DIRECT_IO help Windows OS native file system (NTFS) support up to NTFS version 3.1. 
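The kernel-doc added to simple_getbool() above pins down its contract well enough to restate as a self-contained sketch (a userspace paraphrase of the documented semantics; simple_getbool_like is an invented name, not the NTFS-internal static helper):

#include <stdbool.h>
#include <string.h>

/* A NULL value means the option was given with no '=value' part. */
static int simple_getbool_like(const char *s, bool *setval)
{
	if (!s) {
		*setval = true;		/* bare option defaults to true */
		return 1;
	}
	if (!strcmp(s, "1") || !strcmp(s, "yes") || !strcmp(s, "true"))
		*setval = true;
	else if (!strcmp(s, "0") || !strcmp(s, "no") || !strcmp(s, "false"))
		*setval = false;
	else
		return 0;		/* unrecognized: parsing fails */
	return 1;
}

So a boolean mount option such as ntfs's show_sys_files can be enabled either as "show_sys_files" or as "show_sys_files=true", while a malformed value makes option parsing report an error.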
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index d294cd975688..e9bdc1ff08c9 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -70,7 +70,7 @@ static long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg) /* * ntfs_getattr - inode_operations::getattr */ -int ntfs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, u32 flags) { struct inode *inode = d_inode(path->dentry); @@ -84,7 +84,7 @@ int ntfs_getattr(struct user_namespace *mnt_userns, const struct path *path, stat->attributes_mask |= STATX_ATTR_COMPRESSED | STATX_ATTR_ENCRYPTED; - generic_fillattr(mnt_userns, inode, stat); + generic_fillattr(idmap, inode, stat); stat->result_mask |= STATX_BTIME; stat->btime = ni->i_crtime; @@ -657,7 +657,7 @@ out: /* * ntfs3_setattr - inode_operations::setattr */ -int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct super_block *sb = dentry->d_sb; @@ -676,7 +676,7 @@ int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, ia_valid = attr->ia_valid; } - err = setattr_prepare(mnt_userns, dentry, attr); + err = setattr_prepare(idmap, dentry, attr); if (err) goto out; @@ -704,10 +704,10 @@ int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, inode->i_size = newsize; } - setattr_copy(mnt_userns, inode, attr); + setattr_copy(idmap, inode, attr); if (mode != inode->i_mode) { - err = ntfs_acl_chmod(mnt_userns, dentry); + err = ntfs_acl_chmod(idmap, dentry); if (err) goto out; diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 20b953871574..309d9b46b5d5 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -832,32 +832,29 @@ out: return err; } -static int ntfs_writepage(struct page *page, struct writeback_control *wbc) +static int ntfs_resident_writepage(struct folio *folio, + struct writeback_control *wbc, void *data) { - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; - struct ntfs_inode *ni = ntfs_i(inode); - int err; + struct address_space *mapping = data; + struct ntfs_inode *ni = ntfs_i(mapping->host); + int ret; - if (is_resident(ni)) { - ni_lock(ni); - err = attr_data_write_resident(ni, page); - ni_unlock(ni); - if (err != E_NTFS_NONRESIDENT) { - unlock_page(page); - return err; - } - } + ni_lock(ni); + ret = attr_data_write_resident(ni, &folio->page); + ni_unlock(ni); - return block_write_full_page(page, ntfs_get_block, wbc); + if (ret != E_NTFS_NONRESIDENT) + folio_unlock(folio); + mapping_set_error(mapping, ret); + return ret; } static int ntfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - /* Redirect call to 'ntfs_writepage' for resident files. 
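 * (With ->writepage removed from ntfs3's address_space operations, the
 * resident case is now driven by write_cache_pages(), which walks the
 * dirty pages and calls ntfs_resident_writepage() on each locked folio,
 * handing the mapping back through the opaque data pointer, as the hunk
 * below shows.)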
*/ if (is_resident(ntfs_i(mapping->host))) - return generic_writepages(mapping, wbc); + return write_cache_pages(mapping, wbc, ntfs_resident_writepage, + mapping); return mpage_writepages(mapping, wbc, ntfs_get_block); } @@ -1185,7 +1182,7 @@ out: * * NOTE: if fnd != NULL (ntfs_atomic_open) then @dir is locked */ -struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, +struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const struct cpu_str *uni, umode_t mode, dev_t dev, const char *symname, u32 size, @@ -1307,7 +1304,7 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, goto out3; } inode = &ni->vfs_inode; - inode_init_owner(mnt_userns, inode, dir, mode); + inode_init_owner(idmap, inode, dir, mode); mode = inode->i_mode; inode->i_atime = inode->i_mtime = inode->i_ctime = ni->i_crtime = @@ -1614,7 +1611,7 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, #ifdef CONFIG_NTFS3_FS_POSIX_ACL if (!S_ISLNK(mode) && (sb->s_flags & SB_POSIXACL)) { - err = ntfs_init_acl(mnt_userns, inode, dir); + err = ntfs_init_acl(idmap, inode, dir); if (err) goto out7; } else @@ -2066,13 +2063,13 @@ const struct inode_operations ntfs_link_inode_operations = { const struct address_space_operations ntfs_aops = { .read_folio = ntfs_read_folio, .readahead = ntfs_readahead, - .writepage = ntfs_writepage, .writepages = ntfs_writepages, .write_begin = ntfs_write_begin, .write_end = ntfs_write_end, .direct_IO = ntfs_direct_IO, .bmap = ntfs_bmap, .dirty_folio = block_dirty_folio, + .migrate_folio = buffer_migrate_folio, .invalidate_folio = block_invalidate_folio, }; diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index c8db35e2ae17..407fe92394e2 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -94,12 +94,12 @@ static struct dentry *ntfs_lookup(struct inode *dir, struct dentry *dentry, /* * ntfs_create - inode_operations::create */ -static int ntfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int ntfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; - inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFREG | mode, + inode = ntfs_create_inode(idmap, dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, NULL); return IS_ERR(inode) ? PTR_ERR(inode) : 0; @@ -110,12 +110,12 @@ static int ntfs_create(struct user_namespace *mnt_userns, struct inode *dir, * * inode_operations::mknod */ -static int ntfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int ntfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; - inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, mode, rdev, + inode = ntfs_create_inode(idmap, dir, dentry, NULL, mode, rdev, NULL, 0, NULL); return IS_ERR(inode) ? PTR_ERR(inode) : 0; @@ -183,13 +183,13 @@ static int ntfs_unlink(struct inode *dir, struct dentry *dentry) /* * ntfs_symlink - inode_operations::symlink */ -static int ntfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int ntfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { u32 size = strlen(symname); struct inode *inode; - inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFLNK | 0777, + inode = ntfs_create_inode(idmap, dir, dentry, NULL, S_IFLNK | 0777, 0, symname, size, NULL); return IS_ERR(inode) ? 
PTR_ERR(inode) : 0; @@ -198,12 +198,12 @@ static int ntfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, /* * ntfs_mkdir- inode_operations::mkdir */ -static int ntfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int ntfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; - inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFDIR | mode, + inode = ntfs_create_inode(idmap, dir, dentry, NULL, S_IFDIR | mode, 0, NULL, 0, NULL); return IS_ERR(inode) ? PTR_ERR(inode) : 0; @@ -229,7 +229,7 @@ static int ntfs_rmdir(struct inode *dir, struct dentry *dentry) /* * ntfs_rename - inode_operations::rename */ -static int ntfs_rename(struct user_namespace *mnt_userns, struct inode *dir, +static int ntfs_rename(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, struct inode *new_dir, struct dentry *new_dentry, u32 flags) { @@ -415,13 +415,13 @@ static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry, /* * Unfortunately I don't know how to get here correct 'struct nameidata *nd' - * or 'struct user_namespace *mnt_userns'. + * or 'struct mnt_idmap *idmap'. * See atomic_open in fs/namei.c. * This is why xfstest/633 failed. - * Looks like ntfs_atomic_open must accept 'struct user_namespace *mnt_userns' as argument. + * Looks like ntfs_atomic_open must accept 'struct mnt_idmap *idmap' as argument. */ - inode = ntfs_create_inode(&init_user_ns, dir, dentry, uni, mode, 0, + inode = ntfs_create_inode(&nop_mnt_idmap, dir, dentry, uni, mode, 0, NULL, 0, fnd); err = IS_ERR(inode) ? PTR_ERR(inode) : finish_open(file, dentry, ntfs_file_open); diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 0e051c5595a2..80072e5f96f7 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -492,10 +492,12 @@ bool dir_is_empty(struct inode *dir); extern const struct file_operations ntfs_dir_operations; /* Globals from file.c */ -int ntfs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, u32 flags); -int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); +void ntfs_sparse_cluster(struct inode *inode, struct page *page0, CLST vcn, + CLST len); int ntfs_file_open(struct inode *inode, struct file *file); int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); @@ -706,7 +708,7 @@ int ntfs_sync_inode(struct inode *inode); int ntfs_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2); int inode_write_data(struct inode *inode, const void *data, size_t bytes); -struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, +struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const struct cpu_str *uni, umode_t mode, dev_t dev, const char *symname, u32 size, @@ -857,17 +859,17 @@ unsigned long ntfs_names_hash(const u16 *name, size_t len, const u16 *upcase, /* globals from xattr.c */ #ifdef CONFIG_NTFS3_FS_POSIX_ACL struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu); -int ntfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int ntfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); -int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ntfs_init_acl(struct 
mnt_idmap *idmap, struct inode *inode, struct inode *dir); #else #define ntfs_get_acl NULL #define ntfs_set_acl NULL #endif -int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry); -int ntfs_permission(struct user_namespace *mnt_userns, struct inode *inode, +int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry); +int ntfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size); extern const struct xattr_handler *ntfs_xattr_handlers[]; diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index 616df209feea..ff64302e87e5 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -578,7 +578,7 @@ struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu) return ntfs_get_acl_ex(inode, type, 0); } -static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, +static noinline int ntfs_set_acl_ex(struct mnt_idmap *idmap, struct inode *inode, struct posix_acl *acl, int type, bool init_acl) { @@ -597,7 +597,7 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, case ACL_TYPE_ACCESS: /* Do not change i_mode if we are in init_acl */ if (acl && !init_acl) { - err = posix_acl_update_mode(mnt_userns, inode, &mode, + err = posix_acl_update_mode(idmap, inode, &mode, &acl); if (err) return err; @@ -652,10 +652,10 @@ out: /* * ntfs_set_acl - inode_operations::set_acl */ -int ntfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int ntfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { - return ntfs_set_acl_ex(mnt_userns, d_inode(dentry), acl, type, false); + return ntfs_set_acl_ex(idmap, d_inode(dentry), acl, type, false); } /* @@ -663,7 +663,7 @@ int ntfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, * * Called from ntfs_create_inode(). */ -int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ntfs_init_acl(struct mnt_idmap *idmap, struct inode *inode, struct inode *dir) { struct posix_acl *default_acl, *acl; @@ -674,7 +674,7 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, return err; if (default_acl) { - err = ntfs_set_acl_ex(mnt_userns, inode, default_acl, + err = ntfs_set_acl_ex(idmap, inode, default_acl, ACL_TYPE_DEFAULT, true); posix_acl_release(default_acl); } else { @@ -683,7 +683,7 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, if (acl) { if (!err) - err = ntfs_set_acl_ex(mnt_userns, inode, acl, + err = ntfs_set_acl_ex(idmap, inode, acl, ACL_TYPE_ACCESS, true); posix_acl_release(acl); } else { @@ -697,7 +697,7 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, /* * ntfs_acl_chmod - Helper for ntfs3_setattr(). 
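 * (Called after setattr_copy() has installed a new i_mode: it forwards
 * to posix_acl_chmod() with the mount's idmap so the ACL_TYPE_ACCESS
 * entries are rebuilt to match the new mode, and it rejects symlinks
 * with -EOPNOTSUPP, as the body below shows.)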
*/ -int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry) +int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry) { struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; @@ -708,13 +708,13 @@ int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry) if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - return posix_acl_chmod(mnt_userns, dentry, inode->i_mode); + return posix_acl_chmod(idmap, dentry, inode->i_mode); } /* * ntfs_permission - inode_operations::permission */ -int ntfs_permission(struct user_namespace *mnt_userns, struct inode *inode, +int ntfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { if (ntfs_sb(inode->i_sb)->options->noacsrules) { @@ -722,7 +722,7 @@ int ntfs_permission(struct user_namespace *mnt_userns, struct inode *inode, return 0; } - return generic_permission(mnt_userns, inode, mask); + return generic_permission(idmap, inode, mask); } /* @@ -835,7 +835,7 @@ out: * ntfs_setxattr - inode_operations::setxattr */ static noinline int ntfs_setxattr(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *de, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig index 5d11380d8724..304d12186ccd 100644 --- a/fs/ocfs2/Kconfig +++ b/fs/ocfs2/Kconfig @@ -7,6 +7,7 @@ config OCFS2_FS select QUOTA select QUOTA_TREE select FS_POSIX_ACL + select LEGACY_DIRECT_IO help OCFS2 is a general purpose extent based shared disk cluster file system with many similarities to ext3. It supports 64 bit inode diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 9f19cf9a5a9f..9fd03eaf15f8 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -260,7 +260,7 @@ static int ocfs2_set_acl(handle_t *handle, return ret; } -int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int ocfs2_iop_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { struct buffer_head *bh = NULL; @@ -274,7 +274,7 @@ int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, if (type == ACL_TYPE_ACCESS && acl) { umode_t mode; - status = posix_acl_update_mode(&init_user_ns, inode, &mode, + status = posix_acl_update_mode(&nop_mnt_idmap, inode, &mode, &acl); if (status) goto unlock; diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index a897c4e41b26..667c6f03fa60 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -17,7 +17,7 @@ struct ocfs2_acl_entry { }; struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type, bool rcu); -int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int ocfs2_iop_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *); extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index a07b24d170f2..aecbd712a00c 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -46,6 +46,7 @@ #include <linux/net.h> #include <linux/export.h> #include <net/tcp.h> +#include <trace/events/sock.h> #include <linux/uaccess.h> @@ -585,6 +586,8 @@ static void o2net_data_ready(struct sock *sk) void (*ready)(struct sock *sk); struct o2net_sock_container *sc; + trace_sk_data_ready(sk); + read_lock_bh(&sk->sk_callback_lock); sc = sk->sk_user_data; if (sc) { @@ -1931,6 +1934,8 @@ static void 
o2net_listen_data_ready(struct sock *sk) { void (*ready)(struct sock *sk); + trace_sk_data_ready(sk); + read_lock_bh(&sk->sk_callback_lock); ready = sk->sk_user_data; if (ready == NULL) { /* check for teardown race */ diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 8b2020f92b5f..ba26c5567cff 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -188,18 +188,18 @@ static int dlmfs_file_release(struct inode *inode, * We do ->setattr() just to override size changes. Our size is the size * of the LVB and nothing else. */ -static int dlmfs_file_setattr(struct user_namespace *mnt_userns, +static int dlmfs_file_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { int error; struct inode *inode = d_inode(dentry); attr->ia_valid &= ~ATTR_SIZE; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } @@ -336,7 +336,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb) if (inode) { inode->i_ino = get_next_ino(); - inode_init_owner(&init_user_ns, inode, NULL, mode); + inode_init_owner(&nop_mnt_idmap, inode, NULL, mode); inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inc_nlink(inode); @@ -359,7 +359,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent, return NULL; inode->i_ino = get_next_ino(); - inode_init_owner(&init_user_ns, inode, parent, mode); + inode_init_owner(&nop_mnt_idmap, inode, parent, mode); inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); ip = DLMFS_I(inode); @@ -402,7 +402,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent, * File creation. Allocate an inode, and we're done.. 
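 * (dlmfs is a purely local control filesystem, so the new idmap argument
 * is effectively unused here: ownership and setattr checks go through
 * &nop_mnt_idmap, the identity mapping, as in the hunks above.)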
*/ /* SMP-safe */ -static int dlmfs_mkdir(struct user_namespace * mnt_userns, +static int dlmfs_mkdir(struct mnt_idmap * idmap, struct inode * dir, struct dentry * dentry, umode_t mode) @@ -451,7 +451,7 @@ bail: return status; } -static int dlmfs_create(struct user_namespace *mnt_userns, +static int dlmfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 5c60b6bc85bf..efb09de4343d 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1111,7 +1111,7 @@ out: return ret; } -int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ocfs2_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { int status = 0, size_change; @@ -1142,11 +1142,11 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) return 0; - status = setattr_prepare(&init_user_ns, dentry, attr); + status = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (status) return status; - if (is_quota_modification(mnt_userns, inode, attr)) { + if (is_quota_modification(&nop_mnt_idmap, inode, attr)) { status = dquot_initialize(inode); if (status) return status; @@ -1265,7 +1265,7 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); status = ocfs2_mark_inode_dirty(handle, inode, bh); @@ -1302,7 +1302,7 @@ bail: return status; } -int ocfs2_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ocfs2_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); @@ -1317,7 +1317,7 @@ int ocfs2_getattr(struct user_namespace *mnt_userns, const struct path *path, goto bail; } - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); /* * If there is inline data in the inode, the inode will normally not * have data blocks allocated (it may have an external xattr block). @@ -1334,7 +1334,7 @@ bail: return err; } -int ocfs2_permission(struct user_namespace *mnt_userns, struct inode *inode, +int ocfs2_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int ret, had_lock; @@ -1360,7 +1360,7 @@ int ocfs2_permission(struct user_namespace *mnt_userns, struct inode *inode, dump_stack(); } - ret = generic_permission(&init_user_ns, inode, mask); + ret = generic_permission(&nop_mnt_idmap, inode, mask); ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock); out: @@ -1991,7 +1991,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, } } - if (file && setattr_should_drop_suidgid(&init_user_ns, file_inode(file))) { + if (file && setattr_should_drop_suidgid(&nop_mnt_idmap, file_inode(file))) { ret = __ocfs2_write_remove_suid(inode, di_bh); if (ret) { mlog_errno(ret); @@ -2279,7 +2279,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file, * inode. There's also the dinode i_size state which * can be lost via setattr during extending writes (we * set inode->i_size at the end of a write. 
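 * (The check below asks setattr_should_drop_suidgid() whether this write
 * must clear the suid/sgid bits; ocfs2 is not idmapped-mount aware, so it
 * now passes &nop_mnt_idmap, which behaves identically to the old
 * &init_user_ns for a non-idmapped mount.)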
*/ - if (setattr_should_drop_suidgid(&init_user_ns, inode)) { + if (setattr_should_drop_suidgid(&nop_mnt_idmap, inode)) { if (meta_level == 0) { ocfs2_inode_unlock_for_extent_tree(inode, &di_bh, diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 71db8f3aa027..8e53e4ac1120 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -49,11 +49,11 @@ int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size, u64 zero_to); int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, loff_t zero_to); -int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ocfs2_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); -int ocfs2_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ocfs2_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); -int ocfs2_permission(struct user_namespace *mnt_userns, +int ocfs2_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index afd54ec66103..811a6ea374bb 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -82,7 +82,7 @@ int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa) return status; } -int ocfs2_fileattr_set(struct user_namespace *mnt_userns, +int ocfs2_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h index 0297c8846945..48a5fdfe87a1 100644 --- a/fs/ocfs2/ioctl.h +++ b/fs/ocfs2/ioctl.h @@ -12,7 +12,7 @@ #define OCFS2_IOCTL_PROTO_H int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int ocfs2_fileattr_set(struct user_namespace *mnt_userns, +int ocfs2_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 3fb98b4569a2..25d8072ccfce 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -15,6 +15,7 @@ #include <linux/time.h> #include <linux/random.h> #include <linux/delay.h> +#include <linux/writeback.h> #include <cluster/masklog.h> @@ -841,6 +842,19 @@ bail: return status; } +static int ocfs2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) +{ + struct address_space *mapping = jinode->i_vfs_inode->i_mapping; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = mapping->nrpages * 2, + .range_start = jinode->i_dirty_start, + .range_end = jinode->i_dirty_end, + }; + + return filemap_fdatawrite_wbc(mapping, &wbc); +} + int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) { int status = -1; @@ -910,7 +924,7 @@ int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) journal->j_journal = j_journal; journal->j_journal->j_submit_inode_data_buffers = - jbd2_journal_submit_inode_data_buffers; + ocfs2_journal_submit_inode_data_buffers; journal->j_journal->j_finish_inode_data_buffers = jbd2_journal_finish_inode_data_buffers; journal->j_inode = inode; diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index 73a3854b2afb..f37174e79fad 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -8,6 +8,7 @@ */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/fcntl.h> #include <cluster/masklog.h> diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index a8fd51afb794..9175dbc47201 100644 --- 
a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -197,8 +197,8 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode) * callers. */ if (S_ISDIR(mode)) set_nlink(inode, 2); - mode = mode_strip_sgid(&init_user_ns, dir, mode); - inode_init_owner(&init_user_ns, inode, dir, mode); + mode = mode_strip_sgid(&nop_mnt_idmap, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); status = dquot_initialize(inode); if (status) return ERR_PTR(status); @@ -221,7 +221,7 @@ static void ocfs2_cleanup_add_entry_failure(struct ocfs2_super *osb, iput(inode); } -static int ocfs2_mknod(struct user_namespace *mnt_userns, +static int ocfs2_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, @@ -642,7 +642,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, fe_blkno, suballoc_loc, suballoc_bit); } -static int ocfs2_mkdir(struct user_namespace *mnt_userns, +static int ocfs2_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) @@ -651,14 +651,14 @@ static int ocfs2_mkdir(struct user_namespace *mnt_userns, trace_ocfs2_mkdir(dir, dentry, dentry->d_name.len, dentry->d_name.name, OCFS2_I(dir)->ip_blkno, mode); - ret = ocfs2_mknod(&init_user_ns, dir, dentry, mode | S_IFDIR, 0); + ret = ocfs2_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0); if (ret) mlog_errno(ret); return ret; } -static int ocfs2_create(struct user_namespace *mnt_userns, +static int ocfs2_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, @@ -668,7 +668,7 @@ static int ocfs2_create(struct user_namespace *mnt_userns, trace_ocfs2_create(dir, dentry, dentry->d_name.len, dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno, mode); - ret = ocfs2_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0); + ret = ocfs2_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFREG, 0); if (ret) mlog_errno(ret); @@ -1194,7 +1194,7 @@ static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2) ocfs2_inode_unlock(inode2, 1); } -static int ocfs2_rename(struct user_namespace *mnt_userns, +static int ocfs2_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, @@ -1784,7 +1784,7 @@ bail: return status; } -static int ocfs2_symlink(struct user_namespace *mnt_userns, +static int ocfs2_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 623db358b1ef..5a656dc683f1 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4316,7 +4316,7 @@ static inline int ocfs2_may_create(struct inode *dir, struct dentry *child) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; - return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC); + return inode_permission(&nop_mnt_idmap, dir, MAY_WRITE | MAY_EXEC); } /** @@ -4370,7 +4370,7 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, * file. 
*/ if (!preserve) { - error = inode_permission(&init_user_ns, inode, MAY_READ); + error = inode_permission(&nop_mnt_idmap, inode, MAY_READ); if (error) return error; } diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 64e6ddcfe329..05d4414d0c33 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -9,6 +9,7 @@ #include <linux/module.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/miscdevice.h> #include <linux/mutex.h> #include <linux/slab.h> diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 95d0611c5fc7..389308efe854 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7247,7 +7247,7 @@ static int ocfs2_xattr_security_get(const struct xattr_handler *handler, } static int ocfs2_xattr_security_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -7320,7 +7320,7 @@ static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler, } static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -7351,7 +7351,7 @@ static int ocfs2_xattr_user_get(const struct xattr_handler *handler, } static int ocfs2_xattr_user_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index c219f91f44e9..82cf7e9a665f 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -279,13 +279,13 @@ out_free_inode: return err; } -static int omfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int omfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { return omfs_add_node(dir, dentry, mode | S_IFDIR); } -static int omfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int omfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return omfs_add_node(dir, dentry, mode | S_IFREG); @@ -370,7 +370,7 @@ static bool omfs_fill_chain(struct inode *dir, struct dir_context *ctx, return true; } -static int omfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int omfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 3a5b4b88a583..0101f1f87b56 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -337,13 +337,13 @@ const struct file_operations omfs_file_operations = { .splice_read = generic_file_splice_read, }; -static int omfs_setattr(struct user_namespace *mnt_userns, +static int omfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; @@ -356,7 +356,7 @@ static int omfs_setattr(struct user_namespace *mnt_userns, omfs_truncate(inode); } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 2a0e83236c01..c4c79e07efc7 
100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -48,7 +48,7 @@ struct inode *omfs_new_inode(struct inode *dir, umode_t mode) goto fail; inode->i_ino = new_block; - inode_init_owner(&init_user_ns, inode, NULL, mode); + inode_init_owner(&nop_mnt_idmap, inode, NULL, mode); inode->i_mapping->a_ops = &omfs_aops; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); diff --git a/fs/open.c b/fs/open.c index 82c1a28b3308..4401a73d4032 100644 --- a/fs/open.c +++ b/fs/open.c @@ -33,10 +33,11 @@ #include <linux/dnotify.h> #include <linux/compat.h> #include <linux/mnt_idmapping.h> +#include <linux/filelock.h> #include "internal.h" -int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry, +int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry, loff_t length, unsigned int time_attrs, struct file *filp) { int ret; @@ -54,7 +55,7 @@ int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry, } /* Remove suid, sgid, and file capabilities on truncate too */ - ret = dentry_needs_remove_privs(mnt_userns, dentry); + ret = dentry_needs_remove_privs(idmap, dentry); if (ret < 0) return ret; if (ret) @@ -62,14 +63,14 @@ int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry, inode_lock(dentry->d_inode); /* Note any delegations or leases have already been broken: */ - ret = notify_change(mnt_userns, dentry, &newattrs, NULL); + ret = notify_change(idmap, dentry, &newattrs, NULL); inode_unlock(dentry->d_inode); return ret; } long vfs_truncate(const struct path *path, loff_t length) { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; struct inode *inode; long error; @@ -85,8 +86,8 @@ long vfs_truncate(const struct path *path, loff_t length) if (error) goto out; - mnt_userns = mnt_user_ns(path->mnt); - error = inode_permission(mnt_userns, inode, MAY_WRITE); + idmap = mnt_idmap(path->mnt); + error = inode_permission(idmap, inode, MAY_WRITE); if (error) goto mnt_drop_write_and_out; @@ -108,7 +109,7 @@ long vfs_truncate(const struct path *path, loff_t length) error = security_path_truncate(path); if (!error) - error = do_truncate(mnt_userns, path->dentry, length, 0, NULL); + error = do_truncate(idmap, path->dentry, length, 0, NULL); put_write_and_out: put_write_access(inode); @@ -190,7 +191,7 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small) sb_start_write(inode->i_sb); error = security_file_truncate(f.file); if (!error) - error = do_truncate(file_mnt_user_ns(f.file), dentry, length, + error = do_truncate(file_mnt_idmap(f.file), dentry, length, ATTR_MTIME | ATTR_CTIME, f.file); sb_end_write(inode->i_sb); out_putf: @@ -367,7 +368,37 @@ COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode, compat_arg_u64_dual(offset * access() needs to use the real uid/gid, not the effective uid/gid. * We do this by temporarily clearing all FS-related capabilities and * switching the fsuid/fsgid around to the real ones. + * + * Creating new credentials is expensive, so we try to skip doing it, + * which we can if the result would match what we already got. 
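 *
 * (Concretely, per the helper below: AT_EACCESS never needs an override;
 * neither does a task whose fsuid/fsgid already equal its real uid/gid
 * and whose capability sets already look the way the override would
 * leave them.)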
*/ +static bool access_need_override_creds(int flags) +{ + const struct cred *cred; + + if (flags & AT_EACCESS) + return false; + + cred = current_cred(); + if (!uid_eq(cred->fsuid, cred->uid) || + !gid_eq(cred->fsgid, cred->gid)) + return true; + + if (!issecure(SECURE_NO_SETUID_FIXUP)) { + kuid_t root_uid = make_kuid(cred->user_ns, 0); + if (!uid_eq(cred->uid, root_uid)) { + if (!cap_isclear(cred->cap_effective)) + return true; + } else { + if (!cap_isidentical(cred->cap_effective, + cred->cap_permitted)) + return true; + } + } + + return false; +} + static const struct cred *access_override_creds(void) { const struct cred *old_cred; @@ -377,6 +408,12 @@ static const struct cred *access_override_creds(void) if (!override_cred) return NULL; + /* + * XXX access_need_override_creds performs checks in hopes of skipping + * this work. Make sure it stays in sync if making any changes in this + * routine. + */ + override_cred->fsuid = override_cred->uid; override_cred->fsgid = override_cred->gid; @@ -436,7 +473,7 @@ static long do_faccessat(int dfd, const char __user *filename, int mode, int fla if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; - if (!(flags & AT_EACCESS)) { + if (access_need_override_creds(flags)) { old_cred = access_override_creds(); if (!old_cred) return -ENOMEM; @@ -459,7 +496,7 @@ retry: goto out_path_release; } - res = inode_permission(mnt_user_ns(path.mnt), inode, mode | MAY_ACCESS); + res = inode_permission(mnt_idmap(path.mnt), inode, mode | MAY_ACCESS); /* SuS v2 requires we report a read only fs too */ if (res || !(mode & S_IWOTH) || special_file(inode->i_mode)) goto out_path_release; @@ -603,7 +640,7 @@ retry_deleg: goto out_unlock; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - error = notify_change(mnt_user_ns(path->mnt), path->dentry, + error = notify_change(mnt_idmap(path->mnt), path->dentry, &newattrs, &delegated_inode); out_unlock: inode_unlock(inode); @@ -701,7 +738,8 @@ static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid) int chown_common(const struct path *path, uid_t user, gid_t group) { - struct user_namespace *mnt_userns, *fs_userns; + struct mnt_idmap *idmap; + struct user_namespace *fs_userns; struct inode *inode = path->dentry->d_inode; struct inode *delegated_inode = NULL; int error; @@ -712,7 +750,7 @@ int chown_common(const struct path *path, uid_t user, gid_t group) uid = make_kuid(current_user_ns(), user); gid = make_kgid(current_user_ns(), group); - mnt_userns = mnt_user_ns(path->mnt); + idmap = mnt_idmap(path->mnt); fs_userns = i_user_ns(inode); retry_deleg: @@ -726,14 +764,14 @@ retry_deleg: inode_lock(inode); if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV | - setattr_should_drop_sgid(mnt_userns, inode); + setattr_should_drop_sgid(idmap, inode); /* Continue to send actual fs values, not the mount values. 
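 * (from_vfsuid()/from_vfsgid() reverse the mount's idmapping, so the
 * security hook is shown the ids exactly as they will reach the
 * filesystem.)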
*/ error = security_path_chown( path, - from_vfsuid(mnt_userns, fs_userns, newattrs.ia_vfsuid), - from_vfsgid(mnt_userns, fs_userns, newattrs.ia_vfsgid)); + from_vfsuid(idmap, fs_userns, newattrs.ia_vfsuid), + from_vfsgid(idmap, fs_userns, newattrs.ia_vfsgid)); if (!error) - error = notify_change(mnt_userns, path->dentry, &newattrs, + error = notify_change(idmap, path->dentry, &newattrs, &delegated_inode); inode_unlock(inode); if (delegated_inode) { @@ -870,7 +908,7 @@ static int do_dentry_open(struct file *f, if (error) goto cleanup_all; - error = break_lease(locks_inode(f), f->f_flags); + error = break_lease(file_inode(f), f->f_flags); if (error) goto cleanup_all; @@ -1064,7 +1102,7 @@ struct file *dentry_create(const struct path *path, int flags, umode_t mode, if (IS_ERR(f)) return f; - error = vfs_create(mnt_user_ns(path->mnt), + error = vfs_create(mnt_idmap(path->mnt), d_inode(path->dentry->d_parent), path->dentry, mode, true); if (!error) @@ -1411,8 +1449,9 @@ int filp_close(struct file *filp, fl_owner_t id) { int retval = 0; - if (!file_count(filp)) { - printk(KERN_ERR "VFS: Close: file count is 0\n"); + if (CHECK_DATA_CORRUPTION(file_count(filp) == 0, + "VFS: Close: file count is 0 (f_op=%ps)", + filp->f_op)) { return 0; } diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index c5da2091cefb..5aefb705bcc8 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -118,7 +118,7 @@ out: return error; } -int orangefs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int orangefs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int error; @@ -136,7 +136,7 @@ int orangefs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, * and "mode" to the new desired value. It is up to * us to propagate the new mode back to the server... */ - error = posix_acl_update_mode(&init_user_ns, inode, + error = posix_acl_update_mode(&nop_mnt_idmap, inode, &iattr.ia_mode, &acl); if (error) { gossip_err("%s: posix_acl_update_mode err: %d\n", diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 167fa43b24f9..1a4301a38aa7 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -14,6 +14,7 @@ #include "orangefs-kernel.h" #include "orangefs-bufmap.h" #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/pagemap.h> static int flush_racache(struct inode *inode) @@ -389,8 +390,7 @@ static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma) "orangefs_file_mmap: called on %pD\n", file); /* set the sequential readahead hint */ - vma->vm_flags |= VM_SEQ_READ; - vma->vm_flags &= ~VM_RAND_READ; + vm_flags_mod(vma, VM_SEQ_READ, VM_RAND_READ); file_accessed(file); vma->vm_ops = &orangefs_file_vm_ops; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 4df560894386..aefdf1d3be7c 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -49,10 +49,8 @@ static int orangefs_writepage_locked(struct page *page, /* Should've been handled in orangefs_invalidate_folio. 
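 * (The bio_vec setup below also moves from open-coded assignments to the
 * bvec_set_page() helper, which fills bv_page, bv_len and bv_offset in a
 * single call before iov_iter_bvec() consumes the vector.)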
*/ WARN_ON(off == len || off + wlen > len); - bv.bv_page = page; - bv.bv_len = wlen; - bv.bv_offset = off % PAGE_SIZE; WARN_ON(wlen == 0); + bvec_set_page(&bv, page, wlen, off % PAGE_SIZE); iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, wlen); ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen, @@ -102,15 +100,11 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow, for (i = 0; i < ow->npages; i++) { set_page_writeback(ow->pages[i]); - ow->bv[i].bv_page = ow->pages[i]; - ow->bv[i].bv_len = min(page_offset(ow->pages[i]) + PAGE_SIZE, - ow->off + ow->len) - - max(ow->off, page_offset(ow->pages[i])); - if (i == 0) - ow->bv[i].bv_offset = ow->off - - page_offset(ow->pages[i]); - else - ow->bv[i].bv_offset = 0; + bvec_set_page(&ow->bv[i], ow->pages[i], + min(page_offset(ow->pages[i]) + PAGE_SIZE, + ow->off + ow->len) - + max(ow->off, page_offset(ow->pages[i])), + i == 0 ? ow->off - page_offset(ow->pages[i]) : 0); } iov_iter_bvec(&iter, ITER_SOURCE, ow->bv, ow->npages, ow->len); @@ -154,21 +148,20 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow, return ret; } -static int orangefs_writepages_callback(struct page *page, - struct writeback_control *wbc, void *data) +static int orangefs_writepages_callback(struct folio *folio, + struct writeback_control *wbc, void *data) { struct orangefs_writepages *ow = data; - struct orangefs_write_range *wr; + struct orangefs_write_range *wr = folio->private; int ret; - if (!PagePrivate(page)) { - unlock_page(page); + if (!wr) { + folio_unlock(folio); /* It's not private so there's nothing to write, right? */ printk("writepages_callback not private!\n"); BUG(); return 0; } - wr = (struct orangefs_write_range *)page_private(page); ret = -1; if (ow->npages == 0) { @@ -176,7 +169,7 @@ static int orangefs_writepages_callback(struct page *page, ow->len = wr->len; ow->uid = wr->uid; ow->gid = wr->gid; - ow->pages[ow->npages++] = page; + ow->pages[ow->npages++] = &folio->page; ret = 0; goto done; } @@ -188,7 +181,7 @@ static int orangefs_writepages_callback(struct page *page, } if (ow->off + ow->len == wr->pos) { ow->len += wr->len; - ow->pages[ow->npages++] = page; + ow->pages[ow->npages++] = &folio->page; ret = 0; goto done; } @@ -198,10 +191,10 @@ done: orangefs_writepages_work(ow, wbc); ow->npages = 0; } - ret = orangefs_writepage_locked(page, wbc); - mapping_set_error(page->mapping, ret); - unlock_page(page); - end_page_writeback(page); + ret = orangefs_writepage_locked(&folio->page, wbc); + mapping_set_error(folio->mapping, ret); + folio_unlock(folio); + folio_end_writeback(folio); } else { if (ow->npages == ow->maxpages) { orangefs_writepages_work(ow, wbc); @@ -300,9 +293,7 @@ static int orangefs_read_folio(struct file *file, struct folio *folio) orangefs_launder_folio(folio); off = folio_pos(folio); - bv.bv_page = &folio->page; - bv.bv_len = folio_size(folio); - bv.bv_offset = 0; + bvec_set_folio(&bv, folio, folio_size(folio), 0); iov_iter_bvec(&iter, ITER_DEST, &bv, 1, folio_size(folio)); ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter, @@ -822,7 +813,7 @@ again: ORANGEFS_I(inode)->attr_uid = current_fsuid(); ORANGEFS_I(inode)->attr_gid = current_fsgid(); } - setattr_copy(&init_user_ns, inode, iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); spin_unlock(&inode->i_lock); mark_inode_dirty(inode); @@ -839,20 +830,20 @@ int __orangefs_setattr_mode(struct dentry *dentry, struct iattr *iattr) ret = __orangefs_setattr(inode, iattr); /* change mode on a file that has ACLs */ if (!ret && 
(iattr->ia_valid & ATTR_MODE)) - ret = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); + ret = posix_acl_chmod(&nop_mnt_idmap, dentry, inode->i_mode); return ret; } /* * Change attributes of an object referenced by dentry. */ -int orangefs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int orangefs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { int ret; gossip_debug(GOSSIP_INODE_DEBUG, "__orangefs_setattr: called on %pd\n", dentry); - ret = setattr_prepare(&init_user_ns, dentry, iattr); + ret = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (ret) goto out; ret = __orangefs_setattr_mode(dentry, iattr); @@ -866,7 +857,7 @@ out: /* * Obtain attributes of an object given a dentry */ -int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int orangefs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { int ret; @@ -879,7 +870,7 @@ int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path, ret = orangefs_inode_getattr(inode, request_mask & STATX_SIZE ? ORANGEFS_GETATTR_SIZE : 0); if (ret == 0) { - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); /* override block size reported to stat */ if (!(request_mask & STATX_SIZE)) @@ -890,7 +881,7 @@ int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path, return ret; } -int orangefs_permission(struct user_namespace *mnt_userns, +int orangefs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int ret; @@ -905,7 +896,7 @@ int orangefs_permission(struct user_namespace *mnt_userns, if (ret < 0) return ret; - return generic_permission(&init_user_ns, inode, mask); + return generic_permission(&nop_mnt_idmap, inode, mask); } int orangefs_update_time(struct inode *inode, struct timespec64 *time, int flags) @@ -944,7 +935,7 @@ static int orangefs_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -static int orangefs_fileattr_set(struct user_namespace *mnt_userns, +static int orangefs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { u64 val = 0; diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 75c1a3dcf68c..77518e248cf7 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -15,7 +15,7 @@ /* * Get a newly allocated inode to go with a negative dentry. 
*/ -static int orangefs_create(struct user_namespace *mnt_userns, +static int orangefs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, @@ -216,7 +216,7 @@ static int orangefs_unlink(struct inode *dir, struct dentry *dentry) return ret; } -static int orangefs_symlink(struct user_namespace *mnt_userns, +static int orangefs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) @@ -305,7 +305,7 @@ out: return ret; } -static int orangefs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct orangefs_inode_s *parent = ORANGEFS_I(dir); @@ -375,7 +375,7 @@ out: return ret; } -static int orangefs_rename(struct user_namespace *mnt_userns, +static int orangefs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 6e0cc01b3a14..ce20d3443869 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -106,7 +106,7 @@ enum orangefs_vfs_op_states { extern const struct xattr_handler *orangefs_xattr_handlers[]; extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu); -extern int orangefs_set_acl(struct user_namespace *mnt_userns, +extern int orangefs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type); @@ -362,12 +362,12 @@ struct inode *orangefs_new_inode(struct super_block *sb, int __orangefs_setattr(struct inode *, struct iattr *); int __orangefs_setattr_mode(struct dentry *dentry, struct iattr *iattr); -int orangefs_setattr(struct user_namespace *, struct dentry *, struct iattr *); +int orangefs_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); -int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int orangefs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); -int orangefs_permission(struct user_namespace *mnt_userns, +int orangefs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); int orangefs_update_time(struct inode *, struct timespec64 *, int); diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c index 9a5b757fbd2f..6ecad4f94ae6 100644 --- a/fs/orangefs/xattr.c +++ b/fs/orangefs/xattr.c @@ -526,7 +526,7 @@ out_unlock: } static int orangefs_xattr_set_default(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 6e4e65ee050d..c14e90764e35 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -792,7 +792,7 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) if (!c->metacopy && c->stat.size) { err = ovl_copy_up_file(ofs, c->dentry, tmpfile, c->stat.size); if (err) - return err; + goto out_fput; } err = ovl_copy_up_metadata(c, temp); @@ -1011,6 +1011,10 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, if (err) return err; + if (!kuid_has_mapping(current_user_ns(), ctx.stat.uid) || + !kgid_has_mapping(current_user_ns(), ctx.stat.gid)) + return -EOVERFLOW; + ctx.metacopy = ovl_need_meta_copy_up(dentry, ctx.stat.mode, flags); if (parent) { diff --git 
a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index f61e37f4c8ff..fc25fb95d5fc 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -641,7 +641,7 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, inode->i_state |= I_CREATING; spin_unlock(&inode->i_lock); - inode_init_owner(&init_user_ns, inode, dentry->d_parent->d_inode, mode); + inode_init_owner(&nop_mnt_idmap, inode, dentry->d_parent->d_inode, mode); attr.mode = inode->i_mode; err = ovl_create_or_link(dentry, inode, &attr, false); @@ -655,19 +655,19 @@ out: return err; } -static int ovl_create(struct user_namespace *mnt_userns, struct inode *dir, +static int ovl_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL); } -static int ovl_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int ovl_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL); } -static int ovl_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int ovl_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { /* Don't allow creation of "whiteout" on overlay */ @@ -677,7 +677,7 @@ static int ovl_mknod(struct user_namespace *mnt_userns, struct inode *dir, return ovl_create_object(dentry, mode, rdev, NULL); } -static int ovl_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int ovl_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *link) { return ovl_create_object(dentry, S_IFLNK, 0, link); @@ -1075,7 +1075,7 @@ static int ovl_set_redirect(struct dentry *dentry, bool samedir) return err; } -static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir, +static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, struct dentry *old, struct inode *newdir, struct dentry *new, unsigned int flags) { diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index a25bb3453dde..defd4e231ad2 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -392,8 +392,8 @@ static struct dentry *ovl_lookup_real_one(struct dentry *connected, */ take_dentry_name_snapshot(&name, real); /* - * No mnt_userns handling here: it's an internal lookup. Could skip - * permission checking altogether, but for now just use non-mnt_userns + * No idmap handling here: it's an internal lookup. Could skip + * permission checking altogether, but for now just use non-idmap * transformed ids. 
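 * (lookup_one_len() below performs its MAY_EXEC check with the identity
 * mapping, i.e. against the raw inode ids, which is acceptable for this
 * overlay-internal lookup.)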
*/ this = lookup_one_len(name.name.name, connected, name.name.len); diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index c9d0c362c7ef..7c04f033aadd 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -42,7 +42,7 @@ static struct file *ovl_open_realfile(const struct file *file, { struct inode *realinode = d_inode(realpath->dentry); struct inode *inode = file_inode(file); - struct user_namespace *real_mnt_userns; + struct mnt_idmap *real_idmap; struct file *realfile; const struct cred *old_cred; int flags = file->f_flags | OVL_OPEN_FLAGS; @@ -53,12 +53,12 @@ static struct file *ovl_open_realfile(const struct file *file, acc_mode |= MAY_APPEND; old_cred = ovl_override_creds(inode->i_sb); - real_mnt_userns = mnt_user_ns(realpath->mnt); - err = inode_permission(real_mnt_userns, realinode, MAY_OPEN | acc_mode); + real_idmap = mnt_idmap(realpath->mnt); + err = inode_permission(real_idmap, realinode, MAY_OPEN | acc_mode); if (err) { realfile = ERR_PTR(err); } else { - if (!inode_owner_or_capable(real_mnt_userns, realinode)) + if (!inode_owner_or_capable(real_idmap, realinode)) flags &= ~O_NOATIME; realfile = open_with_fake_path(&file->f_path, flags, realinode, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index ee6dfa577c93..541cf3717fc2 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -19,7 +19,7 @@ #include "overlayfs.h" -int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { int err; @@ -28,7 +28,7 @@ int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct dentry *upperdentry; const struct cred *old_cred; - err = setattr_prepare(&init_user_ns, dentry, attr); + err = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (err) return err; @@ -153,7 +153,7 @@ static void ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid) } } -int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; @@ -278,7 +278,7 @@ out: return err; } -int ovl_permission(struct user_namespace *mnt_userns, +int ovl_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct inode *upperinode = ovl_inode_upper(inode); @@ -298,7 +298,7 @@ int ovl_permission(struct user_namespace *mnt_userns, * Check overlay inode with the creds of task and underlying inode * with creds of mounter */ - err = generic_permission(&init_user_ns, inode, mask); + err = generic_permission(&nop_mnt_idmap, inode, mask); if (err) return err; @@ -310,7 +310,7 @@ int ovl_permission(struct user_namespace *mnt_userns, /* Make sure mounter can read file for copy up later */ mask |= MAY_READ; } - err = inode_permission(mnt_user_ns(realpath.mnt), realinode, mask); + err = inode_permission(mnt_idmap(realpath.mnt), realinode, mask); revert_creds(old_cred); return err; @@ -361,7 +361,7 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, if (!value && !upperdentry) { ovl_path_lower(dentry, &realpath); old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_getxattr(mnt_user_ns(realpath.mnt), realdentry, name, NULL, 0); + err = vfs_getxattr(mnt_idmap(realpath.mnt), realdentry, name, NULL, 0); revert_creds(old_cred); if (err < 0) goto out_drop_write; @@ -403,7 +403,7 @@ int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, 
ovl_i_path_real(inode, &realpath); old_cred = ovl_override_creds(dentry->d_sb); - res = vfs_getxattr(mnt_user_ns(realpath.mnt), realpath.dentry, name, value, size); + res = vfs_getxattr(mnt_idmap(realpath.mnt), realpath.dentry, name, value, size); revert_creds(old_cred); return res; } @@ -463,7 +463,7 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) * alter the POSIX ACLs for the underlying filesystem. */ static void ovl_idmap_posix_acl(const struct inode *realinode, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct posix_acl *acl) { struct user_namespace *fs_userns = i_user_ns(realinode); @@ -475,11 +475,11 @@ static void ovl_idmap_posix_acl(const struct inode *realinode, struct posix_acl_entry *e = &acl->a_entries[i]; switch (e->e_tag) { case ACL_USER: - vfsuid = make_vfsuid(mnt_userns, fs_userns, e->e_uid); + vfsuid = make_vfsuid(idmap, fs_userns, e->e_uid); e->e_uid = vfsuid_into_kuid(vfsuid); break; case ACL_GROUP: - vfsgid = make_vfsgid(mnt_userns, fs_userns, e->e_gid); + vfsgid = make_vfsgid(idmap, fs_userns, e->e_gid); e->e_gid = vfsgid_into_kgid(vfsgid); break; } @@ -514,15 +514,15 @@ struct posix_acl *ovl_get_acl_path(const struct path *path, const char *acl_name, bool noperm) { struct posix_acl *real_acl, *clone; - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; struct inode *realinode = d_inode(path->dentry); - mnt_userns = mnt_user_ns(path->mnt); + idmap = mnt_idmap(path->mnt); if (noperm) real_acl = get_inode_acl(realinode, posix_acl_type(acl_name)); else - real_acl = vfs_get_acl(mnt_userns, path->dentry, acl_name); + real_acl = vfs_get_acl(idmap, path->dentry, acl_name); if (IS_ERR_OR_NULL(real_acl)) return real_acl; @@ -540,7 +540,7 @@ struct posix_acl *ovl_get_acl_path(const struct path *path, if (!clone) return ERR_PTR(-ENOMEM); - ovl_idmap_posix_acl(realinode, mnt_userns, clone); + ovl_idmap_posix_acl(realinode, idmap, clone); return clone; } @@ -555,7 +555,7 @@ struct posix_acl *ovl_get_acl_path(const struct path *path, * * This is obviously only relevant when idmapped layers are used. */ -struct posix_acl *do_ovl_get_acl(struct user_namespace *mnt_userns, +struct posix_acl *do_ovl_get_acl(struct mnt_idmap *idmap, struct inode *inode, int type, bool rcu, bool noperm) { @@ -618,7 +618,7 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode, ovl_path_lower(dentry, &realpath); old_cred = ovl_override_creds(dentry->d_sb); - real_acl = vfs_get_acl(mnt_user_ns(realpath.mnt), realdentry, + real_acl = vfs_get_acl(mnt_idmap(realpath.mnt), realdentry, acl_name); revert_creds(old_cred); if (IS_ERR(real_acl)) { @@ -651,7 +651,7 @@ out_drop_write: return err; } -int ovl_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int ovl_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int err; @@ -665,7 +665,7 @@ int ovl_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, return -EOPNOTSUPP; if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) return acl ? 
-EACCES : 0; - if (!inode_owner_or_capable(&init_user_ns, inode)) + if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) return -EPERM; /* @@ -674,10 +674,10 @@ int ovl_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, */ if (unlikely(inode->i_mode & S_ISGID) && type == ACL_TYPE_ACCESS && !in_group_p(inode->i_gid) && - !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) { + !capable_wrt_inode_uidgid(&nop_mnt_idmap, inode, CAP_FSETID)) { struct iattr iattr = { .ia_valid = ATTR_KILL_SGID }; - err = ovl_setattr(&init_user_ns, dentry, &iattr); + err = ovl_setattr(&nop_mnt_idmap, dentry, &iattr); if (err) return err; } @@ -755,10 +755,10 @@ int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa) if (err) return err; - return vfs_fileattr_set(mnt_user_ns(realpath->mnt), realpath->dentry, fa); + return vfs_fileattr_set(mnt_idmap(realpath->mnt), realpath->dentry, fa); } -int ovl_fileattr_set(struct user_namespace *mnt_userns, +int ovl_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 46753134533a..cfb3420b7df0 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -204,7 +204,7 @@ static struct dentry *ovl_lookup_positive_unlocked(struct ovl_lookup_data *d, struct dentry *base, int len, bool drop_negative) { - struct dentry *ret = lookup_one_unlocked(mnt_user_ns(d->mnt), name, base, len); + struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->mnt), name, base, len); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { if (drop_negative && ret->d_lockref.count == 1) { @@ -711,7 +711,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, if (err) return ERR_PTR(err); - index = lookup_one_positive_unlocked(ovl_upper_mnt_userns(ofs), name.name, + index = lookup_one_positive_unlocked(ovl_upper_mnt_idmap(ofs), name.name, ofs->indexdir, name.len); if (IS_ERR(index)) { err = PTR_ERR(index); @@ -1182,7 +1182,7 @@ bool ovl_lower_positive(struct dentry *dentry) struct dentry *this; struct dentry *lowerdir = poe->lowerstack[i].dentry; - this = lookup_one_positive_unlocked(mnt_user_ns(poe->lowerstack[i].layer->mnt), + this = lookup_one_positive_unlocked(mnt_idmap(poe->lowerstack[i].layer->mnt), name->name, lowerdir, name->len); if (IS_ERR(this)) { switch (PTR_ERR(this)) { diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 1df7f850ff3b..4d0b278f5630 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -141,13 +141,13 @@ static inline int ovl_do_notify_change(struct ovl_fs *ofs, struct dentry *upperdentry, struct iattr *attr) { - return notify_change(ovl_upper_mnt_userns(ofs), upperdentry, attr, NULL); + return notify_change(ovl_upper_mnt_idmap(ofs), upperdentry, attr, NULL); } static inline int ovl_do_rmdir(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry) { - int err = vfs_rmdir(ovl_upper_mnt_userns(ofs), dir, dentry); + int err = vfs_rmdir(ovl_upper_mnt_idmap(ofs), dir, dentry); pr_debug("rmdir(%pd2) = %i\n", dentry, err); return err; @@ -156,7 +156,7 @@ static inline int ovl_do_rmdir(struct ovl_fs *ofs, static inline int ovl_do_unlink(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry) { - int err = vfs_unlink(ovl_upper_mnt_userns(ofs), dir, dentry, NULL); + int err = vfs_unlink(ovl_upper_mnt_idmap(ofs), dir, dentry, NULL); pr_debug("unlink(%pd2) = %i\n", dentry, err); return err; @@ -165,7 +165,8 @@ static inline int 
ovl_do_unlink(struct ovl_fs *ofs, struct inode *dir, static inline int ovl_do_link(struct ovl_fs *ofs, struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) { - int err = vfs_link(old_dentry, ovl_upper_mnt_userns(ofs), dir, new_dentry, NULL); + int err = vfs_link(old_dentry, ovl_upper_mnt_idmap(ofs), dir, + new_dentry, NULL); pr_debug("link(%pd2, %pd2) = %i\n", old_dentry, new_dentry, err); return err; @@ -175,7 +176,7 @@ static inline int ovl_do_create(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, umode_t mode) { - int err = vfs_create(ovl_upper_mnt_userns(ofs), dir, dentry, mode, true); + int err = vfs_create(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, true); pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err); return err; @@ -185,7 +186,7 @@ static inline int ovl_do_mkdir(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, umode_t mode) { - int err = vfs_mkdir(ovl_upper_mnt_userns(ofs), dir, dentry, mode); + int err = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode); pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err); return err; } @@ -194,7 +195,7 @@ static inline int ovl_do_mknod(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { - int err = vfs_mknod(ovl_upper_mnt_userns(ofs), dir, dentry, mode, dev); + int err = vfs_mknod(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, dev); pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err); return err; @@ -204,7 +205,7 @@ static inline int ovl_do_symlink(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, const char *oldname) { - int err = vfs_symlink(ovl_upper_mnt_userns(ofs), dir, dentry, oldname); + int err = vfs_symlink(ovl_upper_mnt_idmap(ofs), dir, dentry, oldname); pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err); return err; @@ -217,7 +218,7 @@ static inline ssize_t ovl_do_getxattr(const struct path *path, const char *name, WARN_ON(path->dentry->d_sb != path->mnt->mnt_sb); - err = vfs_getxattr(mnt_user_ns(path->mnt), path->dentry, + err = vfs_getxattr(mnt_idmap(path->mnt), path->dentry, name, value, size); len = (value && err > 0) ? 
err : 0; @@ -251,7 +252,7 @@ static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name, + int err = vfs_setxattr(ovl_upper_mnt_idmap(ofs), dentry, name, value, size, flags); pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, %d) = %i\n", @@ -269,7 +270,7 @@ static inline int ovl_setxattr(struct ovl_fs *ofs, struct dentry *dentry, static inline int ovl_do_removexattr(struct ovl_fs *ofs, struct dentry *dentry, const char *name) { - int err = vfs_removexattr(ovl_upper_mnt_userns(ofs), dentry, name); + int err = vfs_removexattr(ovl_upper_mnt_idmap(ofs), dentry, name); pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err); return err; } @@ -283,13 +284,13 @@ static inline int ovl_removexattr(struct ovl_fs *ofs, struct dentry *dentry, static inline int ovl_do_set_acl(struct ovl_fs *ofs, struct dentry *dentry, const char *acl_name, struct posix_acl *acl) { - return vfs_set_acl(ovl_upper_mnt_userns(ofs), dentry, acl_name, acl); + return vfs_set_acl(ovl_upper_mnt_idmap(ofs), dentry, acl_name, acl); } static inline int ovl_do_remove_acl(struct ovl_fs *ofs, struct dentry *dentry, const char *acl_name) { - return vfs_remove_acl(ovl_upper_mnt_userns(ofs), dentry, acl_name); + return vfs_remove_acl(ovl_upper_mnt_idmap(ofs), dentry, acl_name); } static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir, @@ -298,10 +299,10 @@ static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir, { int err; struct renamedata rd = { - .old_mnt_userns = ovl_upper_mnt_userns(ofs), + .old_mnt_idmap = ovl_upper_mnt_idmap(ofs), .old_dir = olddir, .old_dentry = olddentry, - .new_mnt_userns = ovl_upper_mnt_userns(ofs), + .new_mnt_idmap = ovl_upper_mnt_idmap(ofs), .new_dir = newdir, .new_dentry = newdentry, .flags = flags, @@ -319,7 +320,7 @@ static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir, static inline int ovl_do_whiteout(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry) { - int err = vfs_whiteout(ovl_upper_mnt_userns(ofs), dir, dentry); + int err = vfs_whiteout(ovl_upper_mnt_idmap(ofs), dir, dentry); pr_debug("whiteout(%pd2) = %i\n", dentry, err); return err; } @@ -328,7 +329,7 @@ static inline struct file *ovl_do_tmpfile(struct ovl_fs *ofs, struct dentry *dentry, umode_t mode) { struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = dentry }; - struct file *file = vfs_tmpfile_open(ovl_upper_mnt_userns(ofs), &path, mode, + struct file *file = vfs_tmpfile_open(ovl_upper_mnt_idmap(ofs), &path, mode, O_LARGEFILE | O_WRONLY, current_cred()); int err = PTR_ERR_OR_ZERO(file); @@ -340,7 +341,7 @@ static inline struct dentry *ovl_lookup_upper(struct ovl_fs *ofs, const char *name, struct dentry *base, int len) { - return lookup_one(ovl_upper_mnt_userns(ofs), name, base, len); + return lookup_one(ovl_upper_mnt_idmap(ofs), name, base, len); } static inline bool ovl_open_flags_need_copy_up(int flags) @@ -596,11 +597,11 @@ int ovl_set_nlink_lower(struct dentry *dentry); unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry, struct dentry *upperdentry, unsigned int fallback); -int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); -int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 
request_mask, unsigned int flags); -int ovl_permission(struct user_namespace *mnt_userns, struct inode *inode, +int ovl_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags); @@ -609,20 +610,20 @@ int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); #ifdef CONFIG_FS_POSIX_ACL -struct posix_acl *do_ovl_get_acl(struct user_namespace *mnt_userns, +struct posix_acl *do_ovl_get_acl(struct mnt_idmap *idmap, struct inode *inode, int type, bool rcu, bool noperm); static inline struct posix_acl *ovl_get_inode_acl(struct inode *inode, int type, bool rcu) { - return do_ovl_get_acl(&init_user_ns, inode, type, rcu, true); + return do_ovl_get_acl(&nop_mnt_idmap, inode, type, rcu, true); } -static inline struct posix_acl *ovl_get_acl(struct user_namespace *mnt_userns, +static inline struct posix_acl *ovl_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type) { - return do_ovl_get_acl(mnt_userns, d_inode(dentry), type, false, false); + return do_ovl_get_acl(idmap, d_inode(dentry), type, false, false); } -int ovl_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int ovl_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); struct posix_acl *ovl_get_acl_path(const struct path *path, const char *acl_name, bool noperm); @@ -717,7 +718,7 @@ void ovl_aio_request_cache_destroy(void); int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa); int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa); int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int ovl_fileattr_set(struct user_namespace *mnt_userns, +int ovl_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); /* copy_up.c */ diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index e1af8f660698..fd11fe6d6d45 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -90,9 +90,9 @@ static inline struct vfsmount *ovl_upper_mnt(struct ovl_fs *ofs) return ofs->layers[0].mnt; } -static inline struct user_namespace *ovl_upper_mnt_userns(struct ovl_fs *ofs) +static inline struct mnt_idmap *ovl_upper_mnt_idmap(struct ovl_fs *ofs) { - return mnt_user_ns(ovl_upper_mnt(ofs)); + return mnt_idmap(ovl_upper_mnt(ofs)); } static inline struct ovl_fs *OVL_FS(struct super_block *sb) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 8cd2b9947de1..b6952b21a7ee 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -278,7 +278,7 @@ static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data while (rdd->first_maybe_whiteout) { p = rdd->first_maybe_whiteout; rdd->first_maybe_whiteout = p->next_maybe_whiteout; - dentry = lookup_one(mnt_user_ns(path->mnt), p->name, dir, p->len); + dentry = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len); if (!IS_ERR(dentry)) { p->is_whiteout = ovl_is_whiteout(dentry); dput(dentry); @@ -480,7 +480,7 @@ static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry goto get; } } - this = lookup_one(mnt_user_ns(path->mnt), p->name, dir, p->len); + this = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len); if (IS_ERR_OR_NULL(this) || !this->d_inode) { /* Mark a stale entry */ p->is_whiteout = true; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 
85b891152a2c..f1d9f75f8786 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1012,7 +1012,7 @@ static int ovl_own_xattr_get(const struct xattr_handler *handler, } static int ovl_own_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -1028,7 +1028,7 @@ static int ovl_other_xattr_get(const struct xattr_handler *handler, } static int ovl_other_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index bde291623c8c..923d66d131c1 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -491,7 +491,7 @@ bool ovl_is_whiteout(struct dentry *dentry) struct file *ovl_path_open(const struct path *path, int flags) { struct inode *inode = d_inode(path->dentry); - struct user_namespace *real_mnt_userns = mnt_user_ns(path->mnt); + struct mnt_idmap *real_idmap = mnt_idmap(path->mnt); int err, acc_mode; if (flags & ~(O_ACCMODE | O_LARGEFILE)) @@ -508,12 +508,12 @@ struct file *ovl_path_open(const struct path *path, int flags) BUG(); } - err = inode_permission(real_mnt_userns, inode, acc_mode | MAY_OPEN); + err = inode_permission(real_idmap, inode, acc_mode | MAY_OPEN); if (err) return ERR_PTR(err); /* O_NOATIME is an optimization, don't fail if not permitted */ - if (inode_owner_or_capable(real_mnt_userns, inode)) + if (inode_owner_or_capable(real_idmap, inode)) flags |= O_NOATIME; return dentry_open(path, flags, current_cred()); @@ -1101,16 +1101,16 @@ void ovl_copyattr(struct inode *inode) { struct path realpath; struct inode *realinode; - struct user_namespace *real_mnt_userns; + struct mnt_idmap *real_idmap; vfsuid_t vfsuid; vfsgid_t vfsgid; ovl_i_path_real(inode, &realpath); realinode = d_inode(realpath.dentry); - real_mnt_userns = mnt_user_ns(realpath.mnt); + real_idmap = mnt_idmap(realpath.mnt); - vfsuid = i_uid_into_vfsuid(real_mnt_userns, realinode); - vfsgid = i_gid_into_vfsgid(real_mnt_userns, realinode); + vfsuid = i_uid_into_vfsuid(real_idmap, realinode); + vfsgid = i_gid_into_vfsgid(real_idmap, realinode); inode->i_uid = vfsuid_into_kuid(vfsuid); inode->i_gid = vfsgid_into_kgid(vfsgid); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index d7bc81fc0840..5a76fb35923a 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -28,6 +28,7 @@ #include <linux/security.h> #include <linux/evm.h> #include <linux/fsnotify.h> +#include <linux/filelock.h> #include "internal.h" @@ -111,7 +112,7 @@ void forget_all_cached_acls(struct inode *inode) } EXPORT_SYMBOL(forget_all_cached_acls); -static struct posix_acl *__get_acl(struct user_namespace *mnt_userns, +static struct posix_acl *__get_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, int type) { @@ -154,7 +155,7 @@ static struct posix_acl *__get_acl(struct user_namespace *mnt_userns, * we'll just create the negative cache entry. */ if (dentry && inode->i_op->get_acl) { - acl = inode->i_op->get_acl(mnt_userns, dentry, type); + acl = inode->i_op->get_acl(idmap, dentry, type); } else if (inode->i_op->get_inode_acl) { acl = inode->i_op->get_inode_acl(inode, type, false); } else { @@ -174,14 +175,14 @@ static struct posix_acl *__get_acl(struct user_namespace *mnt_userns, * Cache the result, but only if our sentinel is still in place. 
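
The first hunk below also modernizes the sentinel swap: try_cmpxchg() returns a bool and, on failure, writes the value it actually observed back through its second argument, so the explicit comparison against the expected value goes away. The two forms are equivalent here (shown in isolation for illustration):

/* Old form: compare cmpxchg()'s returned old value by hand. */
if (cmpxchg(p, sentinel, acl) != sentinel)
        posix_acl_release(acl); /* someone else installed an ACL first */

/* New form: try_cmpxchg() folds the comparison in and, on failure,
 * updates 'sentinel' with the value actually found in *p. */
if (!try_cmpxchg(p, &sentinel, acl))
        posix_acl_release(acl);
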
*/ posix_acl_dup(acl); - if (unlikely(cmpxchg(p, sentinel, acl) != sentinel)) + if (unlikely(!try_cmpxchg(p, &sentinel, acl))) posix_acl_release(acl); return acl; } struct posix_acl *get_inode_acl(struct inode *inode, int type) { - return __get_acl(&init_user_ns, NULL, inode, type); + return __get_acl(&nop_mnt_idmap, NULL, inode, type); } EXPORT_SYMBOL(get_inode_acl); @@ -372,7 +373,7 @@ EXPORT_SYMBOL(posix_acl_from_mode); * by the acl. Returns -E... otherwise. */ int -posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, +posix_acl_permission(struct mnt_idmap *idmap, struct inode *inode, const struct posix_acl *acl, int want) { const struct posix_acl_entry *pa, *pe, *mask_obj; @@ -387,18 +388,18 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, switch(pa->e_tag) { case ACL_USER_OBJ: /* (May have been checked already) */ - vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) goto check_perm; break; case ACL_USER: - vfsuid = make_vfsuid(mnt_userns, fs_userns, + vfsuid = make_vfsuid(idmap, fs_userns, pa->e_uid); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) goto mask; break; case ACL_GROUP_OBJ: - vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsgid = i_gid_into_vfsgid(idmap, inode); if (vfsgid_in_group_p(vfsgid)) { found = 1; if ((pa->e_perm & want) == want) @@ -406,7 +407,7 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, case ACL_GROUP: - vfsgid = make_vfsgid(mnt_userns, fs_userns, + vfsgid = make_vfsgid(idmap, fs_userns, pa->e_gid); if (vfsgid_in_group_p(vfsgid)) { found = 1; @@ -591,18 +592,18 @@ EXPORT_SYMBOL(__posix_acl_chmod); /** * posix_acl_chmod - chmod a posix acl * - * @mnt_userns: user namespace of the mount @inode was found from + * @idmap: idmap of the mount @inode was found from * @dentry: dentry to check permissions on * @mode: the new mode of @inode * - * If the dentry has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before checking + * If the dentry has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply pass @nop_mnt_idmap.
*/ int - posix_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry, + posix_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode) { struct inode *inode = d_inode(dentry); @@ -624,7 +625,7 @@ int ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); if (ret) return ret; - ret = inode->i_op->set_acl(mnt_userns, dentry, acl, ACL_TYPE_ACCESS); + ret = inode->i_op->set_acl(idmap, dentry, acl, ACL_TYPE_ACCESS); posix_acl_release(acl); return ret; } @@ -683,7 +684,7 @@ EXPORT_SYMBOL_GPL(posix_acl_create); /** * posix_acl_update_mode - update mode in set_acl - * @mnt_userns: user namespace of the mount @inode was found from + * @idmap: idmap of the mount @inode was found from * @inode: target inode * @mode_p: mode (pointer) for update * @acl: acl pointer @@ -695,15 +696,15 @@ EXPORT_SYMBOL_GPL(posix_acl_create); * As with chmod, clear the setgid bit if the caller is not in the owning group * or capable of CAP_FSETID (see inode_change_ok). * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before checking + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply pass @nop_mnt_idmap. * * Called from set_acl inode operations. */ -int posix_acl_update_mode(struct user_namespace *mnt_userns, +int posix_acl_update_mode(struct mnt_idmap *idmap, struct inode *inode, umode_t *mode_p, struct posix_acl **acl) { @@ -715,8 +716,8 @@ int posix_acl_update_mode(struct user_namespace *mnt_userns, return error; if (error == 0) *acl = NULL; - if (!vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)) && - !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + if (!vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)) && + !capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID)) mode &= ~S_ISGID; *mode_p = mode; return 0; @@ -893,7 +894,6 @@ static ssize_t vfs_posix_acl_to_xattr(struct mnt_idmap *idmap, struct posix_acl_xattr_header *ext_acl = buffer; struct posix_acl_xattr_entry *ext_entry; struct user_namespace *fs_userns, *caller_userns; - struct user_namespace *mnt_userns; ssize_t real_size, n; vfsuid_t vfsuid; vfsgid_t vfsgid; @@ -909,19 +909,18 @@ static ssize_t vfs_posix_acl_to_xattr(struct mnt_idmap *idmap, fs_userns = i_user_ns(inode); caller_userns = current_user_ns(); - mnt_userns = mnt_idmap_owner(idmap); for (n=0; n < acl->a_count; n++, ext_entry++) { const struct posix_acl_entry *acl_e = &acl->a_entries[n]; ext_entry->e_tag = cpu_to_le16(acl_e->e_tag); ext_entry->e_perm = cpu_to_le16(acl_e->e_perm); switch(acl_e->e_tag) { case ACL_USER: - vfsuid = make_vfsuid(mnt_userns, fs_userns, acl_e->e_uid); + vfsuid = make_vfsuid(idmap, fs_userns, acl_e->e_uid); ext_entry->e_id = cpu_to_le32(from_kuid( caller_userns, vfsuid_into_kuid(vfsuid))); break; case ACL_GROUP: - vfsgid = make_vfsgid(mnt_userns, fs_userns, acl_e->e_gid); + vfsgid = make_vfsgid(idmap, fs_userns, acl_e->e_gid); ext_entry->e_id = cpu_to_le32(from_kgid( caller_userns, vfsgid_into_kgid(vfsgid))); break; @@ -934,7 +933,7 @@ static ssize_t vfs_posix_acl_to_xattr(struct mnt_idmap *idmap, } int -set_posix_acl(struct user_namespace *mnt_userns,
struct dentry *dentry, +set_posix_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type, struct posix_acl *acl) { struct inode *inode = d_inode(dentry); @@ -946,7 +945,7 @@ set_posix_acl(struct user_namespace *mnt_userns, struct dentry *dentry, if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; - if (!inode_owner_or_capable(mnt_userns, inode)) + if (!inode_owner_or_capable(idmap, inode)) return -EPERM; if (acl) { @@ -954,7 +953,7 @@ set_posix_acl(struct user_namespace *mnt_userns, struct dentry *dentry, if (ret) return ret; } - return inode->i_op->set_acl(mnt_userns, dentry, acl, type); + return inode->i_op->set_acl(idmap, dentry, acl, type); } EXPORT_SYMBOL(set_posix_acl); @@ -978,14 +977,14 @@ const struct xattr_handler posix_acl_default_xattr_handler = { }; EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler); -int simple_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int simple_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int error; struct inode *inode = d_inode(dentry); if (type == ACL_TYPE_ACCESS) { - error = posix_acl_update_mode(mnt_userns, inode, + error = posix_acl_update_mode(idmap, inode, &inode->i_mode, &acl); if (error) return error; @@ -1017,7 +1016,7 @@ int simple_acl_create(struct inode *dir, struct inode *inode) return 0; } -static int vfs_set_acl_idmapped_mnt(struct user_namespace *mnt_userns, +static int vfs_set_acl_idmapped_mnt(struct mnt_idmap *idmap, struct user_namespace *fs_userns, struct posix_acl *acl) { @@ -1026,11 +1025,11 @@ static int vfs_set_acl_idmapped_mnt(struct user_namespace *mnt_userns, switch (acl_e->e_tag) { case ACL_USER: - acl_e->e_uid = from_vfsuid(mnt_userns, fs_userns, + acl_e->e_uid = from_vfsuid(idmap, fs_userns, VFSUIDT_INIT(acl_e->e_uid)); break; case ACL_GROUP: - acl_e->e_gid = from_vfsgid(mnt_userns, fs_userns, + acl_e->e_gid = from_vfsgid(idmap, fs_userns, VFSGIDT_INIT(acl_e->e_gid)); break; } @@ -1041,7 +1040,7 @@ static int vfs_set_acl_idmapped_mnt(struct user_namespace *mnt_userns, /** * vfs_set_acl - set posix acls - * @mnt_userns: user namespace of the mount + * @idmap: idmap of the mount * @dentry: the dentry based on which to set the posix acls * @acl_name: the name of the posix acl * @kacl: the posix acls in the appropriate VFS format @@ -1051,7 +1050,7 @@ static int vfs_set_acl_idmapped_mnt(struct user_namespace *mnt_userns, * * Return: On success 0, on error negative errno. */ -int vfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { int acl_type; @@ -1071,7 +1070,7 @@ int vfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, * if this is a filesystem with a backing store - ultimately * translate them to backing store values. */ - error = vfs_set_acl_idmapped_mnt(mnt_userns, i_user_ns(inode), kacl); + error = vfs_set_acl_idmapped_mnt(idmap, i_user_ns(inode), kacl); if (error) return error; } @@ -1083,11 +1082,11 @@ retry_deleg: * We only care about restrictions the inode struct itself places upon * us otherwise POSIX ACLs aren't subject to any VFS restrictions. 
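
The vfs_set_acl_idmapped_mnt() hunk above shows the write-side direction of the mapping: ACL_USER/ACL_GROUP entries arrive in the VFS view and from_vfsuid()/from_vfsgid() translate them back into the filesystem's kuids/kgids before they reach the backing store; make_vfsuid()/make_vfsgid(), used by ovl_idmap_posix_acl() earlier, go the other way. A sketch of the write-side translation for a single entry (the helper name is invented for illustration):

/* Translate one ACL entry from the caller-visible VFS view back to
 * the filesystem's view before it is written to the backing store. */
static void acl_entry_to_fs(struct mnt_idmap *idmap,
                            struct user_namespace *fs_userns,
                            struct posix_acl_entry *e)
{
        if (e->e_tag == ACL_USER)
                e->e_uid = from_vfsuid(idmap, fs_userns,
                                       VFSUIDT_INIT(e->e_uid));
        else if (e->e_tag == ACL_GROUP)
                e->e_gid = from_vfsgid(idmap, fs_userns,
                                       VFSGIDT_INIT(e->e_gid));
}
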
*/ - error = may_write_xattr(mnt_userns, inode); + error = may_write_xattr(idmap, inode); if (error) goto out_inode_unlock; - error = security_inode_set_acl(mnt_userns, dentry, acl_name, kacl); + error = security_inode_set_acl(idmap, dentry, acl_name, kacl); if (error) goto out_inode_unlock; @@ -1096,7 +1095,7 @@ retry_deleg: goto out_inode_unlock; if (inode->i_opflags & IOP_XATTR) - error = set_posix_acl(mnt_userns, dentry, acl_type, kacl); + error = set_posix_acl(idmap, dentry, acl_type, kacl); else if (unlikely(is_bad_inode(inode))) error = -EIO; else @@ -1121,7 +1120,7 @@ EXPORT_SYMBOL_GPL(vfs_set_acl); /** * vfs_get_acl - get posix acls - * @mnt_userns: user namespace of the mount + * @idmap: idmap of the mount * @dentry: the dentry based on which to retrieve the posix acls * @acl_name: the name of the posix acl * @@ -1130,7 +1129,7 @@ EXPORT_SYMBOL_GPL(vfs_set_acl); * * Return: On success POSIX ACLs in VFS format, on error negative errno. */ -struct posix_acl *vfs_get_acl(struct user_namespace *mnt_userns, +struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { struct inode *inode = d_inode(dentry); @@ -1145,7 +1144,7 @@ struct posix_acl *vfs_get_acl(struct user_namespace *mnt_userns, * The VFS has no restrictions on reading POSIX ACLs so calling * something like xattr_permission() isn't needed. Only LSMs get a say. */ - error = security_inode_get_acl(mnt_userns, dentry, acl_name); + error = security_inode_get_acl(idmap, dentry, acl_name); if (error) return ERR_PTR(error); @@ -1154,7 +1153,7 @@ struct posix_acl *vfs_get_acl(struct user_namespace *mnt_userns, if (S_ISLNK(inode->i_mode)) return ERR_PTR(-EOPNOTSUPP); - acl = __get_acl(mnt_userns, dentry, inode, acl_type); + acl = __get_acl(idmap, dentry, inode, acl_type); if (IS_ERR(acl)) return acl; if (!acl) @@ -1166,7 +1165,7 @@ EXPORT_SYMBOL_GPL(vfs_get_acl); /** * vfs_remove_acl - remove posix acls - * @mnt_userns: user namespace of the mount + * @idmap: idmap of the mount * @dentry: the dentry based on which to retrieve the posix acls * @acl_name: the name of the posix acl * @@ -1174,7 +1173,7 @@ EXPORT_SYMBOL_GPL(vfs_get_acl); * * Return: On success 0, on error negative errno. */ -int vfs_remove_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { int acl_type; @@ -1193,11 +1192,11 @@ retry_deleg: * We only care about restrictions the inode struct itself places upon * us otherwise POSIX ACLs aren't subject to any VFS restrictions. 
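
A knock-on effect of passing struct mnt_idmap end to end appears in the do_set_acl()/do_get_acl() hunks below: the transitional mnt_idmap_owner() helper, which unwrapped the idmap into its owning user namespace at the syscall boundary, is no longer needed. Roughly:

/* Before: unwrap the idmap because vfs_set_acl() still spoke
 * struct user_namespace. */
error = vfs_set_acl(mnt_idmap_owner(idmap), dentry, acl_name, acl);

/* After: the idmap flows through untouched. */
error = vfs_set_acl(idmap, dentry, acl_name, acl);
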
*/ - error = may_write_xattr(mnt_userns, inode); + error = may_write_xattr(idmap, inode); if (error) goto out_inode_unlock; - error = security_inode_remove_acl(mnt_userns, dentry, acl_name); + error = security_inode_remove_acl(idmap, dentry, acl_name); if (error) goto out_inode_unlock; @@ -1206,14 +1205,14 @@ retry_deleg: goto out_inode_unlock; if (inode->i_opflags & IOP_XATTR) - error = set_posix_acl(mnt_userns, dentry, acl_type, NULL); + error = set_posix_acl(idmap, dentry, acl_type, NULL); else if (unlikely(is_bad_inode(inode))) error = -EIO; else error = -EOPNOTSUPP; if (!error) { fsnotify_xattr(dentry); - evm_inode_post_remove_acl(mnt_userns, dentry, acl_name); + evm_inode_post_remove_acl(idmap, dentry, acl_name); } out_inode_unlock: @@ -1245,7 +1244,7 @@ int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, return PTR_ERR(acl); } - error = vfs_set_acl(mnt_idmap_owner(idmap), dentry, acl_name, acl); + error = vfs_set_acl(idmap, dentry, acl_name, acl); posix_acl_release(acl); return error; } @@ -1256,7 +1255,7 @@ ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, ssize_t error; struct posix_acl *acl; - acl = vfs_get_acl(mnt_idmap_owner(idmap), dentry, acl_name); + acl = vfs_get_acl(idmap, dentry, acl_name); if (IS_ERR(acl)) return PTR_ERR(acl); diff --git a/fs/proc/base.c b/fs/proc/base.c index 9e479d7d202b..5e0e0ccd47aa 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -685,7 +685,7 @@ static bool proc_fd_access_allowed(struct inode *inode) return allowed; } -int proc_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int proc_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { int error; @@ -694,11 +694,11 @@ int proc_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (attr->ia_valid & ATTR_MODE) return -EPERM; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } @@ -727,7 +727,7 @@ static bool has_pid_permissions(struct proc_fs_info *fs_info, } -static int proc_pid_permission(struct user_namespace *mnt_userns, +static int proc_pid_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); @@ -753,7 +753,7 @@ static int proc_pid_permission(struct user_namespace *mnt_userns, return -EPERM; } - return generic_permission(&init_user_ns, inode, mask); + return generic_permission(&nop_mnt_idmap, inode, mask); } @@ -1959,14 +1959,14 @@ static struct inode *proc_pid_make_base_inode(struct super_block *sb, return inode; } -int pid_getattr(struct user_namespace *mnt_userns, const struct path *path, +int pid_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); struct task_struct *task; - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); stat->uid = GLOBAL_ROOT_UID; stat->gid = GLOBAL_ROOT_GID; @@ -3557,7 +3557,7 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) * This function makes sure that the node is always accessible for members of * same thread group. 
*/ -static int proc_tid_comm_permission(struct user_namespace *mnt_userns, +static int proc_tid_comm_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { bool is_same_tgroup; @@ -3577,7 +3577,7 @@ static int proc_tid_comm_permission(struct user_namespace *mnt_userns, return 0; } - return generic_permission(&init_user_ns, inode, mask); + return generic_permission(&nop_mnt_idmap, inode, mask); } static const struct inode_operations proc_tid_comm_inode_operations = { @@ -3891,13 +3891,13 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) return 0; } -static int proc_task_getattr(struct user_namespace *mnt_userns, +static int proc_task_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct task_struct *p = get_proc_task(inode); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); if (p) { stat->nlink += get_nr_threads(p); diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c index 91fe1597af7b..a6f76121955f 100644 --- a/fs/proc/cmdline.c +++ b/fs/proc/cmdline.c @@ -17,6 +17,7 @@ static int __init proc_cmdline_init(void) struct proc_dir_entry *pde; pde = proc_create_single("cmdline", 0, NULL, cmdline_proc_show); + pde_make_permanent(pde); pde->size = saved_command_line_len + 1; return 0; } diff --git a/fs/proc/fd.c b/fs/proc/fd.c index fc46d6fe080c..b3140deebbbf 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -12,6 +12,7 @@ #include <linux/file.h> #include <linux/seq_file.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/proc_fs.h> @@ -325,13 +326,13 @@ static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, * /proc/pid/fd needs a special permission handler so that a process can still * access /proc/self/fd after it has executed a setuid(). 
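
The proc handlers accept the new idmap argument but never consult it: procfs cannot be mounted with an ID mapping, so every check and attribute fill runs against the identity mapping. Reduced to its shape (example_proc_permission is an illustrative name, not from the patch):

static int example_proc_permission(struct mnt_idmap *idmap,
                                   struct inode *inode, int mask)
{
        /* idmap is deliberately ignored: a procfs mount never
         * carries an ID mapping, so the identity mapping is
         * always the right one to check against. */
        return generic_permission(&nop_mnt_idmap, inode, mask);
}
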
*/ -int proc_fd_permission(struct user_namespace *mnt_userns, +int proc_fd_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct task_struct *p; int rv; - rv = generic_permission(&init_user_ns, inode, mask); + rv = generic_permission(&nop_mnt_idmap, inode, mask); if (rv == 0) return rv; @@ -344,14 +345,14 @@ int proc_fd_permission(struct user_namespace *mnt_userns, return rv; } -static int proc_fd_getattr(struct user_namespace *mnt_userns, +static int proc_fd_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); int rv = 0; - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); /* If it's a directory, put the number of open fds there */ if (S_ISDIR(inode->i_mode)) { diff --git a/fs/proc/fd.h b/fs/proc/fd.h index c5a921a06a0b..7e7265f7e06f 100644 --- a/fs/proc/fd.h +++ b/fs/proc/fd.h @@ -10,7 +10,7 @@ extern const struct inode_operations proc_fd_inode_operations; extern const struct file_operations proc_fdinfo_operations; extern const struct inode_operations proc_fdinfo_inode_operations; -extern int proc_fd_permission(struct user_namespace *mnt_userns, +extern int proc_fd_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); static inline unsigned int proc_fd(struct inode *inode) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 587b91d9d998..8379593fa4bb 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -115,18 +115,18 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir, return true; } -static int proc_notify_change(struct user_namespace *mnt_userns, +static int proc_notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); struct proc_dir_entry *de = PDE(inode); int error; - error = setattr_prepare(&init_user_ns, dentry, iattr); + error = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (error) return error; - setattr_copy(&init_user_ns, inode, iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); mark_inode_dirty(inode); proc_set_user(de, inode->i_uid, inode->i_gid); @@ -134,7 +134,7 @@ static int proc_notify_change(struct user_namespace *mnt_userns, return 0; } -static int proc_getattr(struct user_namespace *mnt_userns, +static int proc_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -147,7 +147,7 @@ static int proc_getattr(struct user_namespace *mnt_userns, } } - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); return 0; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index b701d0207edf..9dda7e54b2d0 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -162,9 +162,9 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, * base.c */ extern const struct dentry_operations pid_dentry_operations; -extern int pid_getattr(struct user_namespace *, const struct path *, +extern int pid_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); -extern int proc_setattr(struct user_namespace *, struct dentry *, +extern int proc_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern void proc_pid_evict_inode(struct proc_inode *); extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 856839b8ae8b..a0c0419872e3 100644 --- 
a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -299,7 +299,7 @@ static struct dentry *proc_tgid_net_lookup(struct inode *dir, return de; } -static int proc_tgid_net_getattr(struct user_namespace *mnt_userns, +static int proc_tgid_net_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -308,7 +308,7 @@ static int proc_tgid_net_getattr(struct user_namespace *mnt_userns, net = get_proc_task_net(inode); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); if (net != NULL) { stat->nlink = net->proc_net->nlink; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 48f2d60bd78a..5851eb5bc726 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -798,7 +798,7 @@ out: return 0; } -static int proc_sys_permission(struct user_namespace *mnt_userns, +static int proc_sys_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { /* @@ -827,7 +827,7 @@ static int proc_sys_permission(struct user_namespace *mnt_userns, return error; } -static int proc_sys_setattr(struct user_namespace *mnt_userns, +static int proc_sys_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -836,16 +836,16 @@ static int proc_sys_setattr(struct user_namespace *mnt_userns, if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) return -EPERM; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } -static int proc_sys_getattr(struct user_namespace *mnt_userns, +static int proc_sys_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -856,7 +856,7 @@ static int proc_sys_getattr(struct user_namespace *mnt_userns, if (IS_ERR(head)) return PTR_ERR(head); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); if (table) stat->mode = (stat->mode & S_IFMT) | table->mode; @@ -1124,6 +1124,11 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table) err |= sysctl_err(path, table, "array not allowed"); } + if (table->proc_handler == proc_dobool) { + if (table->maxlen != sizeof(bool)) + err |= sysctl_err(path, table, "array not allowed"); + } + return err; } @@ -1136,6 +1141,7 @@ static int sysctl_check_table(const char *path, struct ctl_table *table) err |= sysctl_err(path, entry, "Not a file"); if ((entry->proc_handler == proc_dostring) || + (entry->proc_handler == proc_dobool) || (entry->proc_handler == proc_dointvec) || (entry->proc_handler == proc_douintvec) || (entry->proc_handler == proc_douintvec_minmax) || diff --git a/fs/proc/root.c b/fs/proc/root.c index 3c2ee3eb1138..a86e65a608da 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -310,11 +310,11 @@ void __init proc_root_init(void) register_filesystem(&proc_fs_type); } -static int proc_root_getattr(struct user_namespace *mnt_userns, +static int proc_root_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { - generic_fillattr(&init_user_ns, d_inode(path->dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat); stat->nlink = proc_root.nlink + nr_processes(); return 0; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 
e35a0398db63..6a96e1713fd5 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -745,9 +745,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, page = pfn_swap_entry_to_page(swpent); } if (page) { - int mapcount = page_mapcount(page); - - if (mapcount >= 2) + if (page_mapcount(page) >= 2 || hugetlb_pmd_shared(pte)) mss->shared_hugetlb += huge_page_size(hstate_vma(vma)); else mss->private_hugetlb += huge_page_size(hstate_vma(vma)); @@ -892,7 +890,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) struct vm_area_struct *vma; unsigned long vma_start = 0, last_vma_end = 0; int ret = 0; - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); priv->task = get_proc_task(priv->inode); if (!priv->task) @@ -910,7 +908,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) goto out_put_mm; hold_task_mempolicy(priv); - vma = mas_find(&mas, ULONG_MAX); + vma = vma_next(&vmi); if (unlikely(!vma)) goto empty_set; @@ -925,7 +923,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) * access it for write request. */ if (mmap_lock_is_contended(mm)) { - mas_pause(&mas); + vma_iter_invalidate(&vmi); mmap_read_unlock(mm); ret = mmap_read_lock_killable(mm); if (ret) { @@ -950,31 +948,31 @@ static int show_smaps_rollup(struct seq_file *m, void *v) * * 1) VMA2 is freed, but VMA3 exists: * - * find_vma(mm, 16k - 1) will return VMA3. + * vma_next(vmi) will return VMA3. * In this case, just continue from VMA3. * * 2) VMA2 still exists: * - * find_vma(mm, 16k - 1) will return VMA2. - * Iterate the loop like the original one. + * vma_next(vmi) will return VMA3. + * In this case, just continue from VMA3. * * 3) No more VMAs can be found: * - * find_vma(mm, 16k - 1) will return NULL. + * vma_next(vmi) will return NULL. * No more things to do, just break. * * 4) (last_vma_end - 1) is the middle of a vma (VMA'): * - * find_vma(mm, 16k - 1) will return VMA' whose range + * vma_next(vmi) will return VMA' whose range * contains last_vma_end. * Iterate VMA' from last_vma_end. 
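
The MA_STATE()/mas_find() plumbing gives way to the higher-level VMA iterator in the hunks below: VMA_ITERATOR() declares the cursor, for_each_vma()/vma_next() advance it, and vma_iter_invalidate() replaces mas_pause() when the walker is about to drop mmap_lock. The generic loop shape after the conversion (a sketch of the idiom, not this function's body):

struct vm_area_struct *vma;
VMA_ITERATOR(vmi, mm, 0);       /* cursor over mm, starting at address 0 */

mmap_read_lock(mm);
for_each_vma(vmi, vma) {
        /* ... gather per-VMA state ... */

        if (mmap_lock_is_contended(mm)) {
                /* Forget cached tree state before dropping the lock;
                 * the next vma_next() re-walks from the stored index. */
                vma_iter_invalidate(&vmi);
                mmap_read_unlock(mm);
                mmap_read_lock(mm);
        }
}
mmap_read_unlock(mm);
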
*/ - vma = mas_find(&mas, ULONG_MAX); + vma = vma_next(&vmi); /* Case 3 above */ if (!vma) break; - /* Case 1 above */ + /* Case 1 and 2 above */ if (vma->vm_start >= last_vma_end) continue; @@ -982,8 +980,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) if (vma->vm_end > last_vma_end) smap_gather_stats(vma, &mss, last_vma_end); } - /* Case 2 above */ - } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); + } for_each_vma(vmi, vma); empty_set: show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0); @@ -1279,7 +1276,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, return -ESRCH; mm = get_task_mm(task); if (mm) { - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); struct mmu_notifier_range range; struct clear_refs_private cp = { .type = type, @@ -1299,16 +1296,16 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, } if (type == CLEAR_REFS_SOFT_DIRTY) { - mas_for_each(&mas, vma, ULONG_MAX) { + for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_SOFTDIRTY)) continue; - vma->vm_flags &= ~VM_SOFTDIRTY; + vm_flags_clear(vma, VM_SOFTDIRTY); vma_set_page_prot(vma); } inc_tlb_flush_pending(mm); mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, - 0, NULL, mm, 0, -1UL); + 0, mm, 0, -1UL); mmu_notifier_invalidate_range_start(&range); } walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp); diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 2fd06f52b6a4..0ec35072a8e5 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -38,7 +38,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) } if (atomic_read(&mm->mm_count) > 1 || - vma->vm_flags & VM_MAYSHARE) { + is_nommu_shared_mapping(vma->vm_flags)) { sbytes += size; } else { bytes += size; diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 09a81e4b1273..12af614f33ce 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -582,8 +582,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) if (vma->vm_flags & (VM_WRITE | VM_EXEC)) return -EPERM; - vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); - vma->vm_flags |= VM_MIXEDMAP; + vm_flags_mod(vma, VM_MIXEDMAP, VM_MAYWRITE | VM_MAYEXEC); vma->vm_ops = &vmcore_mmap_ops; len = 0; diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig index b59cd172b5f9..d5a85a8062d0 100644 --- a/fs/quota/Kconfig +++ b/fs/quota/Kconfig @@ -6,7 +6,6 @@ config QUOTA bool "Quota support" select QUOTACTL - select SRCU help If you say Y here, you will be able to set per user limits for disk usage (also called disk quotas). 
Currently, it works for the diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index f27faf5db554..a6357f728034 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2085,7 +2085,7 @@ EXPORT_SYMBOL(__dquot_transfer); /* Wrapper for transferring ownership of an inode for uid/gid only * Called from FSXXX_setattr() */ -int dquot_transfer(struct user_namespace *mnt_userns, struct inode *inode, +int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode, struct iattr *iattr) { struct dquot *transfer_to[MAXQUOTAS] = {}; @@ -2096,8 +2096,8 @@ int dquot_transfer(struct user_namespace *mnt_userns, struct inode *inode, if (!dquot_active(inode)) return 0; - if (i_uid_needs_update(mnt_userns, iattr, inode)) { - kuid_t kuid = from_vfsuid(mnt_userns, i_user_ns(inode), + if (i_uid_needs_update(idmap, iattr, inode)) { + kuid_t kuid = from_vfsuid(idmap, i_user_ns(inode), iattr->ia_vfsuid); dquot = dqget(sb, make_kqid_uid(kuid)); @@ -2110,8 +2110,8 @@ int dquot_transfer(struct user_namespace *mnt_userns, struct inode *inode, } transfer_to[USRQUOTA] = dquot; } - if (i_gid_needs_update(mnt_userns, iattr, inode)) { - kgid_t kgid = from_vfsgid(mnt_userns, i_user_ns(inode), + if (i_gid_needs_update(idmap, iattr, inode)) { + kgid_t kgid = from_vfsgid(idmap, i_user_ns(inode), iattr->ia_vfsgid); dquot = dqget(sb, make_kqid_gid(kgid)); diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index cb240eac5036..2f67516bb9bf 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -22,7 +22,7 @@ #include <linux/uaccess.h> #include "internal.h" -static int ramfs_nommu_setattr(struct user_namespace *, struct dentry *, struct iattr *); +static int ramfs_nommu_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, @@ -158,7 +158,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size) * handle a change of attributes * - we're specifically interested in a change of size */ -static int ramfs_nommu_setattr(struct user_namespace *mnt_userns, +static int ramfs_nommu_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *ia) { struct inode *inode = d_inode(dentry); @@ -166,7 +166,7 @@ static int ramfs_nommu_setattr(struct user_namespace *mnt_userns, int ret = 0; /* POSIX UID/GID verification for setting inode attributes */ - ret = setattr_prepare(&init_user_ns, dentry, ia); + ret = setattr_prepare(&nop_mnt_idmap, dentry, ia); if (ret) return ret; @@ -186,7 +186,7 @@ static int ramfs_nommu_setattr(struct user_namespace *mnt_userns, } } - setattr_copy(&init_user_ns, inode, ia); + setattr_copy(&nop_mnt_idmap, inode, ia); out: ia->ia_valid = old_ia_valid; return ret; @@ -264,7 +264,7 @@ out: */ static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma) { - if (!(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) + if (!is_nommu_shared_mapping(vma->vm_flags)) return -ENOSYS; file_accessed(file); diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index b3257e852820..5ba580c78835 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -61,7 +61,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, if (inode) { inode->i_ino = get_next_ino(); - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_mapping->a_ops = &ram_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); @@ -95,7 +95,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, 
*/ /* SMP-safe */ static int -ramfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +ramfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev); @@ -110,22 +110,22 @@ ramfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, return error; } -static int ramfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int ramfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { - int retval = ramfs_mknod(&init_user_ns, dir, dentry, mode | S_IFDIR, 0); + int retval = ramfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0); if (!retval) inc_nlink(dir); return retval; } -static int ramfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int ramfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - return ramfs_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0); + return ramfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFREG, 0); } -static int ramfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int ramfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct inode *inode; @@ -145,7 +145,7 @@ static int ramfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, return error; } -static int ramfs_tmpfile(struct user_namespace *mnt_userns, +static int ramfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode; diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig index 33c8b0dd07a2..4d22ecfe0fab 100644 --- a/fs/reiserfs/Kconfig +++ b/fs/reiserfs/Kconfig @@ -2,6 +2,7 @@ config REISERFS_FS tristate "Reiserfs support (deprecated)" select CRC32 + select LEGACY_DIRECT_IO help Reiserfs is deprecated and scheduled to be removed from the kernel in 2025. 
If you are still using it, please migrate to another diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h index 29c503a06db4..2571b1a8be84 100644 --- a/fs/reiserfs/acl.h +++ b/fs/reiserfs/acl.h @@ -49,7 +49,7 @@ static inline int reiserfs_acl_count(size_t size) #ifdef CONFIG_REISERFS_FS_POSIX_ACL struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu); -int reiserfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +int reiserfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); int reiserfs_acl_chmod(struct dentry *dentry); int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index c7d1fa526dea..d54cab854f60 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -3262,21 +3262,21 @@ static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return ret; } -int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int reiserfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); unsigned int ia_valid; int error; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; /* must be turned off for recursive notify_change calls */ ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); - if (is_quota_modification(mnt_userns, inode, attr)) { + if (is_quota_modification(&nop_mnt_idmap, inode, attr)) { error = dquot_initialize(inode); if (error) return error; @@ -3359,7 +3359,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, reiserfs_write_unlock(inode->i_sb); if (error) goto out; - error = dquot_transfer(mnt_userns, inode, attr); + error = dquot_transfer(&nop_mnt_idmap, inode, attr); reiserfs_write_lock(inode->i_sb); if (error) { journal_end(&th); @@ -3398,7 +3398,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } if (!error) { - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); } diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 4b86ecf5817e..6bf9b54e58ca 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -24,7 +24,7 @@ int reiserfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -int reiserfs_fileattr_set(struct user_namespace *mnt_userns, +int reiserfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); @@ -96,7 +96,7 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) err = put_user(inode->i_generation, (int __user *)arg); break; case REISERFS_IOC_SETVERSION: - if (!inode_owner_or_capable(&init_user_ns, inode)) { + if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) { err = -EPERM; break; } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 9f62da7471c9..9ce4ec296b74 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -601,7 +601,7 @@ static int journal_list_still_alive(struct super_block *s, */ static void release_buffer_page(struct buffer_head *bh) { - struct folio *folio = page_folio(bh->b_page); + struct folio *folio = bh->b_folio; if (!folio->mapping && folio_trylock(folio)) { folio_get(folio); put_bh(bh); @@ -866,7 +866,7 @@ loop_next: * will ever write the buffer. We're safe if we write the * page one last time after freeing the journal header. 
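 *
 * A minimal sketch of the b_page -> b_folio switch applied in this hunk
 * (bh is any attached buffer_head; the names are illustrative):
 *
 *	struct folio *folio = bh->b_folio;	(was: page_folio(bh->b_page))
 *	struct address_space *mapping = folio->mapping;
 *
 * The buffer_head already carries a folio pointer, so the page -> folio
 * lookup can be dropped.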
*/ - if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) { + if (buffer_dirty(bh) && unlikely(bh->b_folio->mapping == NULL)) { spin_unlock(lock); write_dirty_buffer(bh, 0); spin_lock(lock); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 0b8aa99749f1..42d2c20e1345 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -616,11 +616,11 @@ static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode) * the quota init calls have to know who to charge the quota to, so * we have to set uid and gid here */ - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); return dquot_initialize(inode); } -static int reiserfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int reiserfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { int retval; @@ -700,7 +700,7 @@ out_failed: return retval; } -static int reiserfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int reiserfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { int retval; @@ -784,7 +784,7 @@ out_failed: return retval; } -static int reiserfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int reiserfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int retval; @@ -1099,7 +1099,7 @@ out_unlink: return retval; } -static int reiserfs_symlink(struct user_namespace *mnt_userns, +static int reiserfs_symlink(struct mnt_idmap *idmap, struct inode *parent_dir, struct dentry *dentry, const char *symname) { @@ -1311,7 +1311,7 @@ static void set_ino_in_dir_entry(struct reiserfs_dir_entry *de, * one path. If it holds 2 or more, it can get into endless waiting in * get_empty_nodes or its clones */ -static int reiserfs_rename(struct user_namespace *mnt_userns, +static int reiserfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 3aa928ec527a..98e6f53c2fe0 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -3100,7 +3100,7 @@ static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th, } void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode); -int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int reiserfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len); @@ -3407,7 +3407,7 @@ __u32 r5_hash(const signed char *msg, int len); /* prototypes from ioctl.c */ int reiserfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int reiserfs_fileattr_set(struct user_namespace *mnt_userns, +int reiserfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long reiserfs_compat_ioctl(struct file *filp, diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c index b0ae088dffc7..2cec61af2a9e 100644 --- a/fs/reiserfs/tail_conversion.c +++ b/fs/reiserfs/tail_conversion.c @@ -177,7 +177,7 @@ void reiserfs_unmap_buffer(struct buffer_head *bh) * BUG() on attempt to write not mapped buffer */ if ((!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) { - struct inode *inode = bh->b_page->mapping->host; + 
struct inode *inode = bh->b_folio->mapping->host; struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); spin_lock(&j->j_dirty_buffers_lock); list_del_init(&bh->b_assoc_buffers); diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 8b2d52443f41..06d810c72c52 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -66,14 +66,14 @@ static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) { BUG_ON(!inode_is_locked(dir)); - return dir->i_op->create(&init_user_ns, dir, dentry, mode, true); + return dir->i_op->create(&nop_mnt_idmap, dir, dentry, mode, true); } #endif static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { BUG_ON(!inode_is_locked(dir)); - return dir->i_op->mkdir(&init_user_ns, dir, dentry, mode); + return dir->i_op->mkdir(&nop_mnt_idmap, dir, dentry, mode); } /* @@ -352,7 +352,7 @@ static int chown_one_xattr(struct dentry *dentry, void *data) * ATTR_MODE is set. */ attrs->ia_valid &= (ATTR_UID|ATTR_GID); - err = reiserfs_setattr(&init_user_ns, dentry, attrs); + err = reiserfs_setattr(&nop_mnt_idmap, dentry, attrs); attrs->ia_valid = ia_valid; return err; @@ -597,7 +597,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, inode_lock_nested(d_inode(dentry), I_MUTEX_XATTR); inode_dio_wait(d_inode(dentry)); - err = reiserfs_setattr(&init_user_ns, dentry, &newattrs); + err = reiserfs_setattr(&nop_mnt_idmap, dentry, &newattrs); inode_unlock(d_inode(dentry)); } else update_ctime(inode); @@ -941,7 +941,7 @@ static int xattr_mount_check(struct super_block *s) return 0; } -int reiserfs_permission(struct user_namespace *mnt_userns, struct inode *inode, +int reiserfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { /* @@ -951,7 +951,7 @@ int reiserfs_permission(struct user_namespace *mnt_userns, struct inode *inode, if (IS_PRIVATE(inode)) return 0; - return generic_permission(&init_user_ns, inode, mask); + return generic_permission(&nop_mnt_idmap, inode, mask); } static int xattr_hide_revalidate(struct dentry *dentry, unsigned int flags) diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h index e47fde1182de..5868a4e990e3 100644 --- a/fs/reiserfs/xattr.h +++ b/fs/reiserfs/xattr.h @@ -16,7 +16,7 @@ int reiserfs_xattr_init(struct super_block *sb, int mount_flags); int reiserfs_lookup_privroot(struct super_block *sb); int reiserfs_delete_xattrs(struct inode *inode); int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs); -int reiserfs_permission(struct user_namespace *mnt_userns, +int reiserfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); #ifdef CONFIG_REISERFS_FS_XATTR diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 93fe414fed18..138060452678 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -18,7 +18,7 @@ static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th, int -reiserfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +reiserfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int error, error2; @@ -42,7 +42,7 @@ reiserfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, reiserfs_write_unlock(inode->i_sb); if (error == 0) { if (type == ACL_TYPE_ACCESS && acl) { - error = posix_acl_update_mode(&init_user_ns, inode, + error = posix_acl_update_mode(&nop_mnt_idmap, inode, &mode, &acl); if (error) goto unlock; @@ -407,5 +407,5 @@ int reiserfs_acl_chmod(struct dentry *dentry) !reiserfs_posixacl(inode->i_sb)) return 0; - return 
posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); + return posix_acl_chmod(&nop_mnt_idmap, dentry, inode->i_mode); } diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 857a65b05726..41c0ea84fbff 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -22,7 +22,7 @@ security_get(const struct xattr_handler *handler, struct dentry *unused, static int security_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, struct dentry *unused, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index d853cea2afcd..0c0c74d8db0e 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -21,7 +21,7 @@ trusted_get(const struct xattr_handler *handler, struct dentry *unused, static int trusted_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, struct dentry *unused, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index 65d9cd10a5ea..88195181e1d7 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -18,7 +18,7 @@ user_get(const struct xattr_handler *handler, struct dentry *unused, } static int -user_set(const struct xattr_handler *handler, struct user_namespace *mnt_userns, +user_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/remap_range.c b/fs/remap_range.c index 41f60477bb41..1331a890f2f2 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -419,16 +419,16 @@ EXPORT_SYMBOL(vfs_clone_file_range); /* Check whether we are allowed to dedupe the destination file */ static bool allow_file_dedupe(struct file *file) { - struct user_namespace *mnt_userns = file_mnt_user_ns(file); + struct mnt_idmap *idmap = file_mnt_idmap(file); struct inode *inode = file_inode(file); if (capable(CAP_SYS_ADMIN)) return true; if (file->f_mode & FMODE_WRITE) return true; - if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), current_fsuid())) + if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) return true; - if (!inode_permission(mnt_userns, inode, MAY_WRITE)) + if (!inode_permission(idmap, inode, MAY_WRITE)) return true; return false; } diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c index 2c4a23113fb5..4578dc45e50a 100644 --- a/fs/romfs/mmap-nommu.c +++ b/fs/romfs/mmap-nommu.c @@ -63,7 +63,7 @@ static unsigned long romfs_get_unmapped_area(struct file *file, */ static int romfs_mmap(struct file *file, struct vm_area_struct *vma) { - return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS; + return is_nommu_shared_mapping(vma->vm_flags) ? 
0 : -ENOSYS; } static unsigned romfs_mmap_capabilities(struct file *file) diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h index 7d605db3bb3b..ace133cf6072 100644 --- a/fs/smbfs_common/smb2pdu.h +++ b/fs/smbfs_common/smb2pdu.h @@ -167,7 +167,7 @@ struct smb2_err_rsp { __u8 ErrorContextCount; __u8 Reserved; __le32 ByteCount; /* even if zero, at least one byte follows */ - __u8 ErrorData[1]; /* variable length */ + __u8 ErrorData[]; /* variable length */ } __packed; #define SMB3_AES_CCM_NONCE 11 @@ -308,7 +308,7 @@ struct smb2_tree_connect_req { __le16 Flags; /* Flags in SMB3.1.1 */ __le16 PathOffset; __le16 PathLength; - __u8 Buffer[1]; /* variable length */ + __u8 Buffer[]; /* variable length */ } __packed; /* Possible ShareType values */ @@ -595,7 +595,7 @@ struct smb2_negotiate_rsp { __le16 SecurityBufferOffset; __le16 SecurityBufferLength; __le32 NegotiateContextOffset; /* Pre:SMB3.1.1 was reserved/ignored */ - __u8 Buffer[1]; /* variable length GSS security buffer */ + __u8 Buffer[]; /* variable length GSS security buffer */ } __packed; @@ -616,7 +616,7 @@ struct smb2_sess_setup_req { __le16 SecurityBufferOffset; __le16 SecurityBufferLength; __le64 PreviousSessionId; - __u8 Buffer[1]; /* variable length GSS security buffer */ + __u8 Buffer[]; /* variable length GSS security buffer */ } __packed; /* Currently defined SessionFlags */ @@ -633,7 +633,7 @@ struct smb2_sess_setup_rsp { __le16 SessionFlags; __le16 SecurityBufferOffset; __le16 SecurityBufferLength; - __u8 Buffer[1]; /* variable length GSS security buffer */ + __u8 Buffer[]; /* variable length GSS security buffer */ } __packed; @@ -715,7 +715,7 @@ struct smb2_read_req { __le32 RemainingBytes; __le16 ReadChannelInfoOffset; __le16 ReadChannelInfoLength; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; /* Read flags */ @@ -730,7 +730,7 @@ struct smb2_read_rsp { __le32 DataLength; __le32 DataRemaining; __le32 Flags; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; @@ -754,7 +754,7 @@ struct smb2_write_req { __le16 WriteChannelInfoOffset; __le16 WriteChannelInfoLength; __le32 Flags; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; struct smb2_write_rsp { @@ -765,7 +765,7 @@ struct smb2_write_rsp { __le32 DataLength; __le32 DataRemaining; __u32 Reserved2; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; @@ -812,7 +812,10 @@ struct smb2_lock_req { __u64 PersistentFileId; __u64 VolatileFileId; /* Followed by at least one */ - struct smb2_lock_element locks[1]; + union { + struct smb2_lock_element lock; + DECLARE_FLEX_ARRAY(struct smb2_lock_element, locks); + }; } __packed; struct smb2_lock_rsp { @@ -866,7 +869,7 @@ struct smb2_query_directory_req { __le16 FileNameOffset; __le16 FileNameLength; __le32 OutputBufferLength; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; struct smb2_query_directory_rsp { @@ -874,7 +877,7 @@ struct smb2_query_directory_rsp { __le16 StructureSize; /* Must be 9 */ __le16 OutputBufferOffset; __le32 OutputBufferLength; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; /* @@ -897,7 +900,7 @@ struct smb2_set_info_req { __le32 AdditionalInformation; __u64 PersistentFileId; __u64 VolatileFileId; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; struct smb2_set_info_rsp { @@ -952,7 +955,7 @@ struct smb2_change_notify_rsp { __le16 StructureSize; /* Must be 9 */ __le16 OutputBufferOffset; __le32 OutputBufferLength; - __u8 Buffer[1]; /* array of file notify structs */ + __u8 Buffer[]; /* array of file notify structs */ } __packed; @@ -1158,7 +1161,7 @@ struct smb2_create_rsp { __u64 VolatileFileId; __le32 
CreateContextsOffset; __le32 CreateContextsLength; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; struct create_posix { @@ -1501,7 +1504,7 @@ struct smb2_query_info_req { __le32 Flags; __u64 PersistentFileId; __u64 VolatileFileId; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; struct smb2_query_info_rsp { @@ -1509,7 +1512,7 @@ struct smb2_query_info_rsp { __le16 StructureSize; /* Must be 9 */ __le16 OutputBufferOffset; __le32 OutputBufferLength; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; /* @@ -1570,7 +1573,10 @@ struct smb2_file_all_info { /* data block encoding of response to level 18 */ __le32 Mode; __le32 AlignmentRequirement; __le32 FileNameLength; - char FileName[1]; + union { + char __pad; /* Legacy structure padding */ + DECLARE_FLEX_ARRAY(char, FileName); + }; } __packed; /* level 18 Query */ struct smb2_file_eof_info { /* encoding of request for level 10 */ diff --git a/fs/splice.c b/fs/splice.c index 5969b7a1d353..2e76dbb81a8f 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -282,6 +282,99 @@ void splice_shrink_spd(struct splice_pipe_desc *spd) kfree(spd->partial); } +/* + * Splice data from an O_DIRECT file into pages and then add them to the output + * pipe. + */ +ssize_t direct_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct iov_iter to; + struct bio_vec *bv; + struct kiocb kiocb; + struct page **pages; + ssize_t ret; + size_t used, npages, chunk, remain, reclaim; + int i; + + /* Work out how much data we can actually add into the pipe */ + used = pipe_occupancy(pipe->head, pipe->tail); + npages = max_t(ssize_t, pipe->max_usage - used, 0); + len = min_t(size_t, len, npages * PAGE_SIZE); + npages = DIV_ROUND_UP(len, PAGE_SIZE); + + bv = kzalloc(array_size(npages, sizeof(bv[0])) + + array_size(npages, sizeof(struct page *)), GFP_KERNEL); + if (!bv) + return -ENOMEM; + + pages = (void *)(bv + npages); + npages = alloc_pages_bulk_array(GFP_USER, npages, pages); + if (!npages) { + kfree(bv); + return -ENOMEM; + } + + remain = len = min_t(size_t, len, npages * PAGE_SIZE); + + for (i = 0; i < npages; i++) { + chunk = min_t(size_t, PAGE_SIZE, remain); + bv[i].bv_page = pages[i]; + bv[i].bv_offset = 0; + bv[i].bv_len = chunk; + remain -= chunk; + } + + /* Do the I/O */ + iov_iter_bvec(&to, ITER_DEST, bv, npages, len); + init_sync_kiocb(&kiocb, in); + kiocb.ki_pos = *ppos; + ret = call_read_iter(in, &kiocb, &to); + + reclaim = npages * PAGE_SIZE; + remain = 0; + if (ret > 0) { + reclaim -= ret; + remain = ret; + *ppos = kiocb.ki_pos; + file_accessed(in); + } else if (ret < 0) { + /* + * callers of ->splice_read() expect -EAGAIN on + * "can't put anything in there", rather than -EFAULT. + */ + if (ret == -EFAULT) + ret = -EAGAIN; + } + + /* Free any pages that didn't get touched at all. */ + reclaim /= PAGE_SIZE; + if (reclaim) { + npages -= reclaim; + release_pages(pages + npages, reclaim); + } + + /* Push the remaining pages into the pipe. 
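+ * A worked example with hypothetical numbers: with PAGE_SIZE == 4096 and
+ * a read that returned ret == 10000 bytes, remain starts at 10000, so the
+ * loop below queues chunks of 4096, 4096 and 1808 bytes in three
+ * pipe_buffers; pages past the third were already released above.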
*/ + for (i = 0; i < npages; i++) { + struct pipe_buffer *buf = pipe_head_buf(pipe); + + chunk = min_t(size_t, remain, PAGE_SIZE); + *buf = (struct pipe_buffer) { + .ops = &default_pipe_buf_ops, + .page = bv[i].bv_page, + .offset = 0, + .len = chunk, + }; + pipe->head++; + remain -= chunk; + } + + kfree(bv); + return ret; +} +EXPORT_SYMBOL(direct_splice_read); + /** * generic_file_splice_read - splice data from file to a pipe * @in: file to splice from @@ -675,9 +768,8 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, goto done; } - array[n].bv_page = buf->page; - array[n].bv_len = this_len; - array[n].bv_offset = buf->offset; + bvec_set_page(&array[n], buf->page, this_len, + buf->offset); left -= this_len; n++; } diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index b3fdc8212c5f..95f8e8901768 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -183,7 +183,7 @@ static inline int squashfs_block_size(__le32 raw) #define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\ sizeof(u64)) /* xattr id lookup table defines */ -#define SQUASHFS_XATTR_BYTES(A) ((A) * sizeof(struct squashfs_xattr_id)) +#define SQUASHFS_XATTR_BYTES(A) (((u64) (A)) * sizeof(struct squashfs_xattr_id)) #define SQUASHFS_XATTR_BLOCK(A) (SQUASHFS_XATTR_BYTES(A) / \ SQUASHFS_METADATA_SIZE) diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 659082e9e51d..72f6f4b37863 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -63,7 +63,7 @@ struct squashfs_sb_info { long long bytes_used; unsigned int inodes; unsigned int fragments; - int xattr_ids; + unsigned int xattr_ids; unsigned int ids; bool panic_on_errors; const struct squashfs_decompressor_thread_ops *thread_ops; diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h index d8a270d3ac4c..f1a463d8bfa0 100644 --- a/fs/squashfs/xattr.h +++ b/fs/squashfs/xattr.h @@ -10,12 +10,12 @@ #ifdef CONFIG_SQUASHFS_XATTR extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64, - u64 *, int *); + u64 *, unsigned int *); extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *, unsigned int *, unsigned long long *); #else static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb, - u64 start, u64 *xattr_table_start, int *xattr_ids) + u64 start, u64 *xattr_table_start, unsigned int *xattr_ids) { struct squashfs_xattr_id_table *id_table; diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c index 087cab8c78f4..c8469c656e0d 100644 --- a/fs/squashfs/xattr_id.c +++ b/fs/squashfs/xattr_id.c @@ -56,7 +56,7 @@ int squashfs_xattr_lookup(struct super_block *sb, unsigned int index, * Read uncompressed xattr id lookup table indexes from disk into memory */ __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start, - u64 *xattr_table_start, int *xattr_ids) + u64 *xattr_table_start, unsigned int *xattr_ids) { struct squashfs_sb_info *msblk = sb->s_fs_info; unsigned int len, indexes; diff --git a/fs/stat.c b/fs/stat.c index d6cc74ca8486..7c238da22ef0 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -18,6 +18,7 @@ #include <linux/syscalls.h> #include <linux/pagemap.h> #include <linux/compat.h> +#include <linux/iversion.h> #include <linux/uaccess.h> #include <asm/unistd.h> @@ -27,7 +28,7 @@ /** * generic_fillattr - Fill in the basic attributes from the inode struct - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: Inode to use as the source 
* @stat: Where to fill in the attributes * @@ -35,17 +36,17 @@ * found on the VFS inode structure. This is the default if no getattr inode * operation is supplied. * - * If the inode has been found through an idmapped mount the user namespace of - * the vfsmount must be passed through @mnt_userns. This function will then - * take care to map the inode according to @mnt_userns before filling in the + * If the inode has been found through an idmapped mount the idmap of + * the vfsmount must be passed through @idmap. This function will then + * take care to map the inode according to @idmap before filling in the uid and gid fields. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs init_user_ns. + * performed on the raw inode simply pass @nop_mnt_idmap. */ -void generic_fillattr(struct user_namespace *mnt_userns, struct inode *inode, +void generic_fillattr(struct mnt_idmap *idmap, struct inode *inode, struct kstat *stat) { - vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); - vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); stat->dev = inode->i_sb->s_dev; stat->ino = inode->i_ino; @@ -97,7 +98,7 @@ EXPORT_SYMBOL(generic_fill_statx_attr); int vfs_getattr_nosec(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; struct inode *inode = d_backing_inode(path->dentry); memset(stat, 0, sizeof(*stat)); @@ -122,12 +123,17 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT | STATX_ATTR_DAX); - mnt_userns = mnt_user_ns(path->mnt); + if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) { + stat->result_mask |= STATX_CHANGE_COOKIE; + stat->change_cookie = inode_query_iversion(inode); + } + + idmap = mnt_idmap(path->mnt); if (inode->i_op->getattr) - return inode->i_op->getattr(mnt_userns, path, stat, + return inode->i_op->getattr(idmap, path, stat, request_mask, query_flags); - generic_fillattr(mnt_userns, inode, stat); + generic_fillattr(idmap, inode, stat); return 0; } EXPORT_SYMBOL(vfs_getattr_nosec); @@ -602,9 +608,11 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) memset(&tmp, 0, sizeof(tmp)); - tmp.stx_mask = stat->result_mask; + /* STATX_CHANGE_COOKIE is kernel-only for now */ + tmp.stx_mask = stat->result_mask & ~STATX_CHANGE_COOKIE; tmp.stx_blksize = stat->blksize; - tmp.stx_attributes = stat->attributes; + /* STATX_ATTR_CHANGE_MONOTONIC is kernel-only for now */ + tmp.stx_attributes = stat->attributes & ~STATX_ATTR_CHANGE_MONOTONIC; tmp.stx_nlink = stat->nlink; tmp.stx_uid = from_kuid_munged(current_user_ns(), stat->uid); tmp.stx_gid = from_kgid_munged(current_user_ns(), stat->gid); @@ -643,6 +651,11 @@ int do_statx(int dfd, struct filename *filename, unsigned int flags, if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE) return -EINVAL; + /* STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests + from userland.
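+ * A hypothetical call statx(AT_FDCWD, "f", 0,
+ * STATX_BASIC_STATS | STATX_CHANGE_COOKIE, &sx) therefore behaves like
+ * a request for STATX_BASIC_STATS alone: the cookie bit is cleared
+ * before vfs_statx() runs and is masked out of stx_mask again in
+ * cp_statx() above.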
+ */ + mask &= ~STATX_CHANGE_COOKIE; + error = vfs_statx(dfd, filename, flags, &stat, mask); if (error) return error; diff --git a/fs/super.c b/fs/super.c index 12c08cb20405..84332d5cb817 100644 --- a/fs/super.c +++ b/fs/super.c @@ -291,7 +291,6 @@ static void __put_super(struct super_block *s) WARN_ON(s->s_inode_lru.node); WARN_ON(!list_empty(&s->s_mounts)); security_sb_free(s); - fscrypt_destroy_keyring(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); call_rcu(&s->rcu, destroy_super_rcu); @@ -491,10 +490,23 @@ void generic_shutdown_super(struct super_block *sb) if (sop->put_super) sop->put_super(sb); - if (!list_empty(&sb->s_inodes)) { - printk("VFS: Busy inodes after unmount of %s. " - "Self-destruct in 5 seconds. Have a nice day...\n", - sb->s_id); + if (CHECK_DATA_CORRUPTION(!list_empty(&sb->s_inodes), + "VFS: Busy inodes after unmount of %s (%s)", + sb->s_id, sb->s_type->name)) { + /* + * Adding a proper bailout path here would be hard, but + * we can at least make it more likely that a later + * iput_final() or such crashes cleanly. + */ + struct inode *inode; + + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + inode->i_op = VFS_PTR_POISON; + inode->i_sb = VFS_PTR_POISON; + inode->i_mapping = VFS_PTR_POISON; + } + spin_unlock(&sb->s_inode_list_lock); } } spin_lock(&sb_lock); @@ -1764,3 +1776,27 @@ int thaw_super(struct super_block *sb) return thaw_super_locked(sb); } EXPORT_SYMBOL(thaw_super); + +/* + * Create workqueue for deferred direct IO completions. We allocate the + * workqueue when it's first needed. This avoids creating workqueue for + * filesystems that don't need it and also allows us to create the workqueue + * late enough so that we can include s_id in the name of the workqueue. + */ +int sb_init_dio_done_wq(struct super_block *sb) +{ + struct workqueue_struct *old; + struct workqueue_struct *wq = alloc_workqueue("dio/%s", + WQ_MEM_RECLAIM, 0, + sb->s_id); + if (!wq) + return -ENOMEM; + /* + * This has to be atomic as more DIOs can race to create the workqueue + */ + old = cmpxchg(&sb->s_dio_done_wq, NULL, wq); + /* Someone created workqueue before us? Free ours... */ + if (old) + destroy_workqueue(wq); + return 0; +} diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 88e38cd8f5c9..999bceb99974 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -28,37 +28,50 @@ const struct file_operations sysv_dir_operations = { .fsync = generic_file_fsync, }; -static inline void dir_put_page(struct page *page) +inline void dir_put_page(struct page *page, void *page_addr) { - kunmap(page); + kunmap_local((void *)((unsigned long)page_addr & PAGE_MASK)); put_page(page); } -static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len) +static void dir_commit_chunk(struct page *page, loff_t pos, unsigned len) { struct address_space *mapping = page->mapping; struct inode *dir = mapping->host; - int err = 0; block_write_end(NULL, mapping, pos, len, len, page, NULL); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); mark_inode_dirty(dir); } - if (IS_DIRSYNC(dir)) - err = write_one_page(page); - else - unlock_page(page); + unlock_page(page); +} + +static int sysv_handle_dirsync(struct inode *dir) +{ + int err; + + err = filemap_write_and_wait(dir->i_mapping); + if (!err) + err = sync_inode_metadata(dir, 1); return err; } -static struct page * dir_get_page(struct inode *dir, unsigned long n) +/* + * Calls to dir_get_page()/dir_put_page() must be nested according to the + * rules documented in mm/highmem.rst.
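+ * A sketch of the resulting LIFO discipline (pa/pb/p0/p1 are
+ * illustrative names):
+ *
+ *	pa = dir_get_page(dir, 0, &p0);
+ *	pb = dir_get_page(dir, 1, &p1);
+ *	...
+ *	dir_put_page(p1, pb);	(most recent mapping first)
+ *	dir_put_page(p0, pa);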
+ * + * NOTE: sysv_find_entry() and sysv_dotdot() act as calls to dir_get_page() + * and must be treated accordingly for nesting purposes. + */ +static void *dir_get_page(struct inode *dir, unsigned long n, struct page **p) { struct address_space *mapping = dir->i_mapping; struct page *page = read_mapping_page(mapping, n, NULL); - if (!IS_ERR(page)) - kmap(page); - return page; + if (IS_ERR(page)) + return ERR_CAST(page); + *p = page; + return kmap_local_page(page); } static int sysv_readdir(struct file *file, struct dir_context *ctx) @@ -80,11 +93,11 @@ static int sysv_readdir(struct file *file, struct dir_context *ctx) for ( ; n < npages; n++, offset = 0) { char *kaddr, *limit; struct sysv_dir_entry *de; - struct page *page = dir_get_page(inode, n); + struct page *page; - if (IS_ERR(page)) + kaddr = dir_get_page(inode, n, &page); + if (IS_ERR(kaddr)) continue; - kaddr = (char *)page_address(page); de = (struct sysv_dir_entry *)(kaddr+offset); limit = kaddr + PAGE_SIZE - SYSV_DIRSIZE; for ( ;(char*)de <= limit; de++, ctx->pos += sizeof(*de)) { @@ -96,11 +109,11 @@ static int sysv_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN), fs16_to_cpu(SYSV_SB(sb), de->inode), DT_UNKNOWN)) { - dir_put_page(page); + dir_put_page(page, kaddr); return 0; } } - dir_put_page(page); + dir_put_page(page, kaddr); } return 0; } @@ -123,6 +136,11 @@ static inline int namecompare(int len, int maxlen, * returns the cache buffer in which the entry was found, and the entry * itself (as a parameter - res_dir). It does NOT read the inode of the * entry - you'll have to do that yourself if you want to. + * + * On Success dir_put_page() should be called on *res_page. + * + * sysv_find_entry() acts as a call to dir_get_page() and must be treated + * accordingly for nesting purposes. 
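+ *
+ * Typical use, mirroring sysv_inode_by_name() below (error handling
+ * elided):
+ *
+ *	de = sysv_find_entry(dentry, &page);
+ *	if (de) {
+ *		ino = fs16_to_cpu(SYSV_SB(dentry->d_sb), de->inode);
+ *		dir_put_page(page, de);
+ *	}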
*/ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_page) { @@ -142,11 +160,10 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_ n = start; do { - char *kaddr; - page = dir_get_page(dir, n); - if (!IS_ERR(page)) { - kaddr = (char*)page_address(page); - de = (struct sysv_dir_entry *) kaddr; + char *kaddr = dir_get_page(dir, n, &page); + + if (!IS_ERR(kaddr)) { + de = (struct sysv_dir_entry *)kaddr; kaddr += PAGE_SIZE - SYSV_DIRSIZE; for ( ; (char *) de <= kaddr ; de++) { if (!de->inode) @@ -155,7 +172,7 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_ name, de->name)) goto found; } - dir_put_page(page); + dir_put_page(page, kaddr); } if (++n >= npages) @@ -185,11 +202,9 @@ int sysv_add_link(struct dentry *dentry, struct inode *inode) /* We take care of directory expansion in the same loop */ for (n = 0; n <= npages; n++) { - page = dir_get_page(dir, n); - err = PTR_ERR(page); - if (IS_ERR(page)) - goto out; - kaddr = (char*)page_address(page); + kaddr = dir_get_page(dir, n, &page); + if (IS_ERR(kaddr)) + return PTR_ERR(kaddr); de = (struct sysv_dir_entry *)kaddr; kaddr += PAGE_SIZE - SYSV_DIRSIZE; while ((char *)de <= kaddr) { @@ -200,14 +215,13 @@ int sysv_add_link(struct dentry *dentry, struct inode *inode) goto out_page; de++; } - dir_put_page(page); + dir_put_page(page, kaddr); } BUG(); return -EINVAL; got_it: - pos = page_offset(page) + - (char*)de - (char*)page_address(page); + pos = page_offset(page) + offset_in_page(de); lock_page(page); err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE); if (err) @@ -215,12 +229,12 @@ got_it: memcpy (de->name, name, namelen); memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2); de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); - err = dir_commit_chunk(page, pos, SYSV_DIRSIZE); + dir_commit_chunk(page, pos, SYSV_DIRSIZE); dir->i_mtime = dir->i_ctime = current_time(dir); mark_inode_dirty(dir); + err = sysv_handle_dirsync(dir); out_page: - dir_put_page(page); -out: + dir_put_page(page, kaddr); return err; out_unlock: unlock_page(page); @@ -230,19 +244,20 @@ out_unlock: int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page) { struct inode *inode = page->mapping->host; - char *kaddr = (char*)page_address(page); - loff_t pos = page_offset(page) + (char *)de - kaddr; + loff_t pos = page_offset(page) + offset_in_page(de); int err; lock_page(page); err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE); - BUG_ON(err); + if (err) { + unlock_page(page); + return err; + } de->inode = 0; - err = dir_commit_chunk(page, pos, SYSV_DIRSIZE); - dir_put_page(page); + dir_commit_chunk(page, pos, SYSV_DIRSIZE); inode->i_ctime = inode->i_mtime = current_time(inode); mark_inode_dirty(inode); - return err; + return sysv_handle_dirsync(inode); } int sysv_make_empty(struct inode *inode, struct inode *dir) @@ -259,9 +274,7 @@ int sysv_make_empty(struct inode *inode, struct inode *dir) unlock_page(page); goto fail; } - kmap(page); - - base = (char*)page_address(page); + base = kmap_local_page(page); memset(base, 0, PAGE_SIZE); de = (struct sysv_dir_entry *) base; @@ -271,8 +284,9 @@ int sysv_make_empty(struct inode *inode, struct inode *dir) de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), dir->i_ino); strcpy(de->name,".."); - kunmap(page); - err = dir_commit_chunk(page, 0, 2 * SYSV_DIRSIZE); + kunmap_local(base); + dir_commit_chunk(page, 0, 2 * SYSV_DIRSIZE); + err = sysv_handle_dirsync(inode); fail: put_page(page); return err; @@ -286,16 +300,15 @@ int 
sysv_empty_dir(struct inode * inode) struct super_block *sb = inode->i_sb; struct page *page = NULL; unsigned long i, npages = dir_pages(inode); + char *kaddr; for (i = 0; i < npages; i++) { - char *kaddr; - struct sysv_dir_entry * de; - page = dir_get_page(inode, i); + struct sysv_dir_entry *de; - if (IS_ERR(page)) + kaddr = dir_get_page(inode, i, &page); + if (IS_ERR(kaddr)) continue; - kaddr = (char *)page_address(page); de = (struct sysv_dir_entry *)kaddr; kaddr += PAGE_SIZE-SYSV_DIRSIZE; @@ -314,44 +327,51 @@ int sysv_empty_dir(struct inode * inode) if (de->name[1] != '.' || de->name[2]) goto not_empty; } - dir_put_page(page); + dir_put_page(page, kaddr); } return 1; not_empty: - dir_put_page(page); + dir_put_page(page, kaddr); return 0; } /* Releases the page */ -void sysv_set_link(struct sysv_dir_entry *de, struct page *page, +int sysv_set_link(struct sysv_dir_entry *de, struct page *page, struct inode *inode) { struct inode *dir = page->mapping->host; - loff_t pos = page_offset(page) + - (char *)de-(char*)page_address(page); + loff_t pos = page_offset(page) + offset_in_page(de); int err; lock_page(page); err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE); - BUG_ON(err); + if (err) { + unlock_page(page); + return err; + } de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); - err = dir_commit_chunk(page, pos, SYSV_DIRSIZE); - dir_put_page(page); + dir_commit_chunk(page, pos, SYSV_DIRSIZE); dir->i_mtime = dir->i_ctime = current_time(dir); mark_inode_dirty(dir); + return sysv_handle_dirsync(inode); } -struct sysv_dir_entry * sysv_dotdot (struct inode *dir, struct page **p) +/* + * Calls to dir_get_page()/dir_put_page() must be nested according to the + * rules documented in mm/highmem.rst. + * + * sysv_dotdot() acts as a call to dir_get_page() and must be treated + * accordingly for nesting purposes. + */ +struct sysv_dir_entry *sysv_dotdot(struct inode *dir, struct page **p) { - struct page *page = dir_get_page(dir, 0); - struct sysv_dir_entry *de = NULL; + struct sysv_dir_entry *de = dir_get_page(dir, 0, p); - if (!IS_ERR(page)) { - de = (struct sysv_dir_entry*) page_address(page) + 1; - *p = page; - } - return de; + if (IS_ERR(de)) + return NULL; + /* ".." 
is the second directory entry */ + return de + 1; } ino_t sysv_inode_by_name(struct dentry *dentry) @@ -362,7 +382,7 @@ ino_t sysv_inode_by_name(struct dentry *dentry) if (de) { res = fs16_to_cpu(SYSV_SB(dentry->d_sb), de->inode); - dir_put_page(page); + dir_put_page(page, de); } return res; } diff --git a/fs/sysv/file.c b/fs/sysv/file.c index 90e00124ea07..50eb92557a0f 100644 --- a/fs/sysv/file.c +++ b/fs/sysv/file.c @@ -29,13 +29,13 @@ const struct file_operations sysv_file_operations = { .splice_read = generic_file_splice_read, }; -static int sysv_setattr(struct user_namespace *mnt_userns, +static int sysv_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; @@ -48,7 +48,7 @@ static int sysv_setattr(struct user_namespace *mnt_userns, sysv_truncate(inode); } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c index 50df794a3c1f..e732879036ab 100644 --- a/fs/sysv/ialloc.c +++ b/fs/sysv/ialloc.c @@ -163,7 +163,7 @@ struct inode * sysv_new_inode(const struct inode * dir, umode_t mode) *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); fs16_add(sbi, sbi->s_sb_total_free_inodes, -1); dirty_sb(sb); - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_ino = fs16_to_cpu(sbi, ino); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_blocks = 0; diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 3b8567564e7e..b22764fe669c 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -441,11 +441,11 @@ static unsigned sysv_nblocks(struct super_block *s, loff_t size) return res; } -int sysv_getattr(struct user_namespace *mnt_userns, const struct path *path, +int sysv_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct super_block *s = path->dentry->d_sb; - generic_fillattr(&init_user_ns, d_inode(path->dentry), stat); + generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat); stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size); stat->blksize = s->s_blocksize; return 0; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index b2e6abc06a2d..a25862773d82 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -41,7 +41,7 @@ static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, un return d_splice_alias(inode, dentry); } -static int sysv_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int sysv_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode * inode; @@ -61,13 +61,13 @@ static int sysv_mknod(struct user_namespace *mnt_userns, struct inode *dir, return err; } -static int sysv_create(struct user_namespace *mnt_userns, struct inode *dir, +static int sysv_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - return sysv_mknod(&init_user_ns, dir, dentry, mode, 0); + return sysv_mknod(&nop_mnt_idmap, dir, dentry, mode, 0); } -static int sysv_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int sysv_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { int err = -ENAMETOOLONG; @@ -110,7 +110,7 
@@ static int sysv_link(struct dentry * old_dentry, struct inode * dir, return add_nondir(dentry, inode); } -static int sysv_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int sysv_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode * inode; @@ -153,19 +153,18 @@ static int sysv_unlink(struct inode * dir, struct dentry * dentry) struct inode * inode = d_inode(dentry); struct page * page; struct sysv_dir_entry * de; - int err = -ENOENT; + int err; de = sysv_find_entry(dentry, &page); if (!de) - goto out; + return -ENOENT; - err = sysv_delete_entry (de, page); - if (err) - goto out; - - inode->i_ctime = dir->i_ctime; - inode_dec_link_count(inode); -out: + err = sysv_delete_entry(de, page); + if (!err) { + inode->i_ctime = dir->i_ctime; + inode_dec_link_count(inode); + } + dir_put_page(page, de); return err; } @@ -189,7 +188,7 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry) * Anybody can rename anything with this: the permission checks are left to the * higher-level routines. */ -static int sysv_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -227,7 +226,10 @@ static int sysv_rename(struct user_namespace *mnt_userns, struct inode *old_dir, new_de = sysv_find_entry(new_dentry, &new_page); if (!new_de) goto out_dir; - sysv_set_link(new_de, new_page, old_inode); + err = sysv_set_link(new_de, new_page, old_inode); + dir_put_page(new_page, new_de); + if (err) + goto out_dir; new_inode->i_ctime = current_time(new_inode); if (dir_de) drop_nlink(new_inode); @@ -240,23 +242,23 @@ static int sysv_rename(struct user_namespace *mnt_userns, struct inode *old_dir, inode_inc_link_count(new_dir); } - sysv_delete_entry(old_de, old_page); + err = sysv_delete_entry(old_de, old_page); + if (err) + goto out_dir; + mark_inode_dirty(old_inode); if (dir_de) { - sysv_set_link(dir_de, dir_page, new_dir); - inode_dec_link_count(old_dir); + err = sysv_set_link(dir_de, dir_page, new_dir); + if (!err) + inode_dec_link_count(old_dir); } - return 0; out_dir: - if (dir_de) { - kunmap(dir_page); - put_page(dir_page); - } + if (dir_de) + dir_put_page(dir_page, dir_de); out_old: - kunmap(old_page); - put_page(old_page); + dir_put_page(old_page, old_de); out: return err; } diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index 99ddf033da4f..f2c36ea42df6 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -141,19 +141,20 @@ extern struct inode *sysv_iget(struct super_block *, unsigned int); extern int sysv_write_inode(struct inode *, struct writeback_control *wbc); extern int sysv_sync_inode(struct inode *); extern void sysv_set_inode(struct inode *, dev_t); -extern int sysv_getattr(struct user_namespace *, const struct path *, +extern int sysv_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern int sysv_init_icache(void); extern void sysv_destroy_icache(void); /* dir.c */ +extern void dir_put_page(struct page *page, void *vaddr); extern struct sysv_dir_entry *sysv_find_entry(struct dentry *, struct page **); extern int sysv_add_link(struct dentry *, struct inode *); extern int sysv_delete_entry(struct sysv_dir_entry *, struct page *); extern int sysv_make_empty(struct inode *, struct inode *); extern int sysv_empty_dir(struct inode *); -extern void sysv_set_link(struct sysv_dir_entry *, struct page *, +extern int 
sysv_set_link(struct sysv_dir_entry *, struct page *, struct inode *); extern struct sysv_dir_entry *sysv_dotdot(struct inode *, struct page **); extern ino_t sysv_inode_by_name(struct dentry *); diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index da85b3979195..57ac8aa4a724 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -67,7 +67,7 @@ static char *get_dname(struct dentry *dentry) return name; } -static int tracefs_syscall_mkdir(struct user_namespace *mnt_userns, +static int tracefs_syscall_mkdir(struct mnt_idmap *idmap, struct inode *inode, struct dentry *dentry, umode_t mode) { diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 4509d9fa9e6e..1505539f6fe9 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -95,7 +95,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, */ inode->i_flags |= S_NOCMTIME; - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_mapping->nrpages = 0; @@ -283,7 +283,7 @@ static int ubifs_prepare_create(struct inode *dir, struct dentry *dentry, return fscrypt_setup_filename(dir, &dentry->d_name, 0, nm); } -static int ubifs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int ubifs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; @@ -426,7 +426,7 @@ static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) mutex_unlock(&ubifs_inode(inode1)->ui_mutex); } -static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +static int ubifs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct dentry *dentry = file->f_path.dentry; @@ -979,7 +979,7 @@ out_fname: return err; } -static int ubifs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; @@ -1052,7 +1052,7 @@ out_budg: return err; } -static int ubifs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; @@ -1141,7 +1141,7 @@ out_budg: return err; } -static int ubifs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct inode *inode; @@ -1622,7 +1622,7 @@ out: return err; } -static int ubifs_rename(struct user_namespace *mnt_userns, +static int ubifs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) @@ -1647,7 +1647,7 @@ static int ubifs_rename(struct user_namespace *mnt_userns, return do_rename(old_dir, old_dentry, new_dir, new_dentry, flags); } -int ubifs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ubifs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { loff_t size; @@ -1670,7 +1670,7 @@ int ubifs_getattr(struct user_namespace *mnt_userns, const struct path *path, STATX_ATTR_ENCRYPTED | STATX_ATTR_IMMUTABLE); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); stat->blksize = UBIFS_BLOCK_SIZE; stat->size = ui->ui_size; diff --git 
a/fs/ubifs/file.c b/fs/ubifs/file.c index 10c1779af9c5..979ab1d9d0c3 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1264,7 +1264,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, return err; } -int ubifs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ubifs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { int err; @@ -1273,7 +1273,7 @@ int ubifs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, dbg_gen("ino %lu, mode %#x, ia_valid %#x", inode->i_ino, inode->i_mode, attr->ia_valid); - err = setattr_prepare(&init_user_ns, dentry, attr); + err = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (err) return err; @@ -1623,11 +1623,11 @@ static const char *ubifs_get_link(struct dentry *dentry, return fscrypt_get_symlink(inode, ui->data, ui->data_len, done); } -static int ubifs_symlink_getattr(struct user_namespace *mnt_userns, +static int ubifs_symlink_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { - ubifs_getattr(mnt_userns, path, stat, request_mask, query_flags); + ubifs_getattr(idmap, path, stat, request_mask, query_flags); if (IS_ENCRYPTED(d_inode(path->dentry))) return fscrypt_symlink_getattr(path, stat); diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 71bcebe45f9c..67c5108abd89 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -144,7 +144,7 @@ int ubifs_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -int ubifs_fileattr_set(struct user_namespace *mnt_userns, +int ubifs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 2f1f31581094..4c36044140e7 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -2025,15 +2025,15 @@ int ubifs_calc_dark(const struct ubifs_info *c, int spc); /* file.c */ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync); -int ubifs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ubifs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags); /* dir.c */ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, umode_t mode, bool is_xattr); -int ubifs_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags); +int ubifs_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags); int ubifs_check_dir_empty(struct inode *dir); /* xattr.c */ @@ -2090,7 +2090,7 @@ void ubifs_destroy_size_tree(struct ubifs_info *c); /* ioctl.c */ int ubifs_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int ubifs_fileattr_set(struct user_namespace *mnt_userns, +int ubifs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); void ubifs_set_inode_flags(struct inode *inode); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 3db8486e3725..349228dd1191 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -699,7 +699,7 @@ static int xattr_get(const struct xattr_handler *handler, } static int xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const 
void *value, size_t size, int flags) diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig index 26e1a49f3ba7..82e8bfa2dfd9 100644 --- a/fs/udf/Kconfig +++ b/fs/udf/Kconfig @@ -3,6 +3,7 @@ config UDF_FS tristate "UDF file system support" select CRC_ITU_T select NLS + select LEGACY_DIRECT_IO help This is a file system used on some CD-ROMs and DVDs. Since the file system is supported by multiple operating systems and is more diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 8e597db4d971..14b9db4c80f0 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -36,18 +36,41 @@ static int read_block_bitmap(struct super_block *sb, unsigned long bitmap_nr) { struct buffer_head *bh = NULL; - int retval = 0; + int i; + int max_bits, off, count; struct kernel_lb_addr loc; loc.logicalBlockNum = bitmap->s_extPosition; loc.partitionReferenceNum = UDF_SB(sb)->s_partition; - bh = udf_tread(sb, udf_get_lb_pblock(sb, &loc, block)); + bh = sb_bread(sb, udf_get_lb_pblock(sb, &loc, block)); + bitmap->s_block_bitmap[bitmap_nr] = bh; if (!bh) - retval = -EIO; + return -EIO; - bitmap->s_block_bitmap[bitmap_nr] = bh; - return retval; + /* Check consistency of Space Bitmap buffer. */ + max_bits = sb->s_blocksize * 8; + if (!bitmap_nr) { + off = sizeof(struct spaceBitmapDesc) << 3; + count = min(max_bits - off, bitmap->s_nr_groups); + } else { + /* + * Rough check if bitmap number is too big to have any bitmap + * blocks reserved. + */ + if (bitmap_nr > + (bitmap->s_nr_groups >> (sb->s_blocksize_bits + 3)) + 2) + return 0; + off = 0; + count = bitmap->s_nr_groups - bitmap_nr * max_bits + + (sizeof(struct spaceBitmapDesc) << 3); + count = min(count, max_bits); + } + + for (i = 0; i < count; i++) + if (udf_test_bit(i + off, bh->b_data)) + return -EFSCORRUPTED; + return 0; } static int __load_block_bitmap(struct super_block *sb, diff --git a/fs/udf/dir.c b/fs/udf/dir.c index be640f4b2f2c..212393b12c22 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -39,26 +39,13 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) { struct inode *dir = file_inode(file); - struct udf_inode_info *iinfo = UDF_I(dir); - struct udf_fileident_bh fibh = { .sbh = NULL, .ebh = NULL}; - struct fileIdentDesc *fi = NULL; - struct fileIdentDesc cfi; - udf_pblk_t block, iblock; loff_t nf_pos, emit_pos = 0; int flen; - unsigned char *fname = NULL, *copy_name = NULL; - unsigned char *nameptr; - uint16_t liu; - uint8_t lfi; - loff_t size = udf_ext0_offset(dir) + dir->i_size; - struct buffer_head *tmp, *bha[16]; - struct kernel_lb_addr eloc; - uint32_t elen; - sector_t offset; - int i, num, ret = 0; - struct extent_position epos = { NULL, 0, {0, 0} }; + unsigned char *fname = NULL; + int ret = 0; struct super_block *sb = dir->i_sb; bool pos_valid = false; + struct udf_fileident_iter iter; if (ctx->pos == 0) { if (!dir_emit_dot(file, ctx)) @@ -66,7 +53,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) ctx->pos = 1; } nf_pos = (ctx->pos - 1) << 2; - if (nf_pos >= size) + if (nf_pos >= dir->i_size) goto out; /* @@ -90,138 +77,57 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) goto out; } - if (nf_pos == 0) - nf_pos = udf_ext0_offset(dir); - - fibh.soffset = fibh.eoffset = nf_pos & (sb->s_blocksize - 1); - if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { - if (inode_bmap(dir, nf_pos >> sb->s_blocksize_bits, - &epos, &eloc, &elen, &offset) - != (EXT_RECORDED_ALLOCATED >> 30)) { - ret = -ENOENT; - goto out; - } - block = udf_get_lb_pblock(sb, &eloc, offset); - if ((++offset << sb->s_blocksize_bits) < elen) 
{ - if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(struct short_ad); - else if (iinfo->i_alloc_type == - ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(struct long_ad); - } else { - offset = 0; - } - - if (!(fibh.sbh = fibh.ebh = udf_tread(sb, block))) { - ret = -EIO; - goto out; - } - - if (!(offset & ((16 >> (sb->s_blocksize_bits - 9)) - 1))) { - i = 16 >> (sb->s_blocksize_bits - 9); - if (i + offset > (elen >> sb->s_blocksize_bits)) - i = (elen >> sb->s_blocksize_bits) - offset; - for (num = 0; i > 0; i--) { - block = udf_get_lb_pblock(sb, &eloc, offset + i); - tmp = udf_tgetblk(sb, block); - if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp)) - bha[num++] = tmp; - else - brelse(tmp); - } - if (num) { - bh_readahead_batch(num, bha, REQ_RAHEAD); - for (i = 0; i < num; i++) - brelse(bha[i]); - } - } - } - - while (nf_pos < size) { + for (ret = udf_fiiter_init(&iter, dir, nf_pos); + !ret && iter.pos < dir->i_size; + ret = udf_fiiter_advance(&iter)) { struct kernel_lb_addr tloc; - loff_t cur_pos = nf_pos; + udf_pblk_t iblock; - /* Update file position only if we got past the current one */ - if (nf_pos >= emit_pos) { - ctx->pos = (nf_pos >> 2) + 1; - pos_valid = true; - } - - fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc, - &elen, &offset); - if (!fi) - goto out; /* Still not at offset where user asked us to read from? */ - if (cur_pos < emit_pos) + if (iter.pos < emit_pos) continue; - liu = le16_to_cpu(cfi.lengthOfImpUse); - lfi = cfi.lengthFileIdent; - - if (fibh.sbh == fibh.ebh) { - nameptr = udf_get_fi_ident(fi); - } else { - int poffset; /* Unpaded ending offset */ - - poffset = fibh.soffset + sizeof(struct fileIdentDesc) + liu + lfi; - - if (poffset >= lfi) { - nameptr = (char *)(fibh.ebh->b_data + poffset - lfi); - } else { - if (!copy_name) { - copy_name = kmalloc(UDF_NAME_LEN, - GFP_NOFS); - if (!copy_name) { - ret = -ENOMEM; - goto out; - } - } - nameptr = copy_name; - memcpy(nameptr, udf_get_fi_ident(fi), - lfi - poffset); - memcpy(nameptr + lfi - poffset, - fibh.ebh->b_data, poffset); - } - } + /* Update file position only if we got past the current one */ + pos_valid = true; + ctx->pos = (iter.pos >> 2) + 1; - if ((cfi.fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { + if (iter.fi.fileCharacteristics & FID_FILE_CHAR_DELETED) { if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE)) continue; } - if ((cfi.fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) { + if (iter.fi.fileCharacteristics & FID_FILE_CHAR_HIDDEN) { if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE)) continue; } - if (cfi.fileCharacteristics & FID_FILE_CHAR_PARENT) { + if (iter.fi.fileCharacteristics & FID_FILE_CHAR_PARENT) { if (!dir_emit_dotdot(file, ctx)) - goto out; + goto out_iter; continue; } - flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN); + flen = udf_get_filename(sb, iter.name, + iter.fi.lengthFileIdent, fname, UDF_NAME_LEN); if (flen < 0) continue; - tloc = lelb_to_cpu(cfi.icb.extLocation); + tloc = lelb_to_cpu(iter.fi.icb.extLocation); iblock = udf_get_lb_pblock(sb, &tloc, 0); if (!dir_emit(ctx, fname, flen, iblock, DT_UNKNOWN)) - goto out; - } /* end while */ - - ctx->pos = (nf_pos >> 2) + 1; - pos_valid = true; + goto out_iter; + } + if (!ret) { + ctx->pos = (iter.pos >> 2) + 1; + pos_valid = true; + } +out_iter: + udf_fiiter_release(&iter); out: if (pos_valid) file->f_version = inode_query_iversion(dir); - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - brelse(epos.bh); kfree(fname); - kfree(copy_name); return ret; } diff --git 
a/fs/udf/directory.c b/fs/udf/directory.c index 16bcf2c6b8b3..654536d2b609 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -17,183 +17,478 @@ #include <linux/fs.h> #include <linux/string.h> #include <linux/bio.h> +#include <linux/crc-itu-t.h> +#include <linux/iversion.h> -struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, - struct udf_fileident_bh *fibh, - struct fileIdentDesc *cfi, - struct extent_position *epos, - struct kernel_lb_addr *eloc, uint32_t *elen, - sector_t *offset) +static int udf_verify_fi(struct udf_fileident_iter *iter) { - struct fileIdentDesc *fi; - int i, num; - udf_pblk_t block; - struct buffer_head *tmp, *bha[16]; - struct udf_inode_info *iinfo = UDF_I(dir); - - fibh->soffset = fibh->eoffset; + unsigned int len; + + if (iter->fi.descTag.tagIdent != cpu_to_le16(TAG_IDENT_FID)) { + udf_err(iter->dir->i_sb, + "directory (ino %lu) has entry at pos %llu with incorrect tag %x\n", + iter->dir->i_ino, (unsigned long long)iter->pos, + le16_to_cpu(iter->fi.descTag.tagIdent)); + return -EFSCORRUPTED; + } + len = udf_dir_entry_len(&iter->fi); + if (le16_to_cpu(iter->fi.lengthOfImpUse) & 3) { + udf_err(iter->dir->i_sb, + "directory (ino %lu) has entry at pos %llu with unaligned length of impUse field\n", + iter->dir->i_ino, (unsigned long long)iter->pos); + return -EFSCORRUPTED; + } + /* + * This is in fact allowed by the spec due to long impUse field but + * we don't support it. If there is real media with this large impUse + * field, support can be added. + */ + if (len > 1 << iter->dir->i_blkbits) { + udf_err(iter->dir->i_sb, + "directory (ino %lu) has too big (%u) entry at pos %llu\n", + iter->dir->i_ino, len, (unsigned long long)iter->pos); + return -EFSCORRUPTED; + } + if (iter->pos + len > iter->dir->i_size) { + udf_err(iter->dir->i_sb, + "directory (ino %lu) has entry past directory size at pos %llu\n", + iter->dir->i_ino, (unsigned long long)iter->pos); + return -EFSCORRUPTED; + } + if (udf_dir_entry_len(&iter->fi) != + sizeof(struct tag) + le16_to_cpu(iter->fi.descTag.descCRCLength)) { + udf_err(iter->dir->i_sb, + "directory (ino %lu) has entry where CRC length (%u) does not match entry length (%u)\n", + iter->dir->i_ino, + (unsigned)le16_to_cpu(iter->fi.descTag.descCRCLength), + (unsigned)(udf_dir_entry_len(&iter->fi) - + sizeof(struct tag))); + return -EFSCORRUPTED; + } + return 0; +} +static int udf_copy_fi(struct udf_fileident_iter *iter) +{ + struct udf_inode_info *iinfo = UDF_I(iter->dir); + u32 blksize = 1 << iter->dir->i_blkbits; + u32 off, len, nameoff; + int err; + + /* Skip copying when we are at EOF */ + if (iter->pos >= iter->dir->i_size) { + iter->name = NULL; + return 0; + } + if (iter->dir->i_size < iter->pos + sizeof(struct fileIdentDesc)) { + udf_err(iter->dir->i_sb, + "directory (ino %lu) has entry straddling EOF\n", + iter->dir->i_ino); + return -EFSCORRUPTED; + } if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { - fi = udf_get_fileident(iinfo->i_data - - (iinfo->i_efe ? 
- sizeof(struct extendedFileEntry) : - sizeof(struct fileEntry)), - dir->i_sb->s_blocksize, - &(fibh->eoffset)); - if (!fi) - return NULL; - - *nf_pos += fibh->eoffset - fibh->soffset; - - memcpy((uint8_t *)cfi, (uint8_t *)fi, + memcpy(&iter->fi, iinfo->i_data + iinfo->i_lenEAttr + iter->pos, sizeof(struct fileIdentDesc)); - - return fi; + err = udf_verify_fi(iter); + if (err < 0) + return err; + iter->name = iinfo->i_data + iinfo->i_lenEAttr + iter->pos + + sizeof(struct fileIdentDesc) + + le16_to_cpu(iter->fi.lengthOfImpUse); + return 0; } - if (fibh->eoffset == dir->i_sb->s_blocksize) { - uint32_t lextoffset = epos->offset; - unsigned char blocksize_bits = dir->i_sb->s_blocksize_bits; - - if (udf_next_aext(dir, epos, eloc, elen, 1) != - (EXT_RECORDED_ALLOCATED >> 30)) - return NULL; + off = iter->pos & (blksize - 1); + len = min_t(int, sizeof(struct fileIdentDesc), blksize - off); + memcpy(&iter->fi, iter->bh[0]->b_data + off, len); + if (len < sizeof(struct fileIdentDesc)) + memcpy((char *)(&iter->fi) + len, iter->bh[1]->b_data, + sizeof(struct fileIdentDesc) - len); + err = udf_verify_fi(iter); + if (err < 0) + return err; + + /* Handle directory entry name */ + nameoff = off + sizeof(struct fileIdentDesc) + + le16_to_cpu(iter->fi.lengthOfImpUse); + if (off + udf_dir_entry_len(&iter->fi) <= blksize) { + iter->name = iter->bh[0]->b_data + nameoff; + } else if (nameoff >= blksize) { + iter->name = iter->bh[1]->b_data + (nameoff - blksize); + } else { + iter->name = iter->namebuf; + len = blksize - nameoff; + memcpy(iter->name, iter->bh[0]->b_data + nameoff, len); + memcpy(iter->name + len, iter->bh[1]->b_data, + iter->fi.lengthFileIdent - len); + } + return 0; +} - block = udf_get_lb_pblock(dir->i_sb, eloc, *offset); +/* Readahead 8k once we are at 8k boundary */ +static void udf_readahead_dir(struct udf_fileident_iter *iter) +{ + unsigned int ralen = 16 >> (iter->dir->i_blkbits - 9); + struct buffer_head *tmp, *bha[16]; + int i, num; + udf_pblk_t blk; + + if (iter->loffset & (ralen - 1)) + return; + + if (iter->loffset + ralen > (iter->elen >> iter->dir->i_blkbits)) + ralen = (iter->elen >> iter->dir->i_blkbits) - iter->loffset; + num = 0; + for (i = 0; i < ralen; i++) { + blk = udf_get_lb_pblock(iter->dir->i_sb, &iter->eloc, + iter->loffset + i); + tmp = sb_getblk(iter->dir->i_sb, blk); + if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp)) + bha[num++] = tmp; + else + brelse(tmp); + } + if (num) { + bh_readahead_batch(num, bha, REQ_RAHEAD); + for (i = 0; i < num; i++) + brelse(bha[i]); + } +} - (*offset)++; +static struct buffer_head *udf_fiiter_bread_blk(struct udf_fileident_iter *iter) +{ + udf_pblk_t blk; - if ((*offset << blocksize_bits) >= *elen) - *offset = 0; - else - epos->offset = lextoffset; + udf_readahead_dir(iter); + blk = udf_get_lb_pblock(iter->dir->i_sb, &iter->eloc, iter->loffset); + return sb_bread(iter->dir->i_sb, blk); +} - brelse(fibh->sbh); - fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block); - if (!fibh->sbh) - return NULL; - fibh->soffset = fibh->eoffset = 0; - - if (!(*offset & ((16 >> (blocksize_bits - 9)) - 1))) { - i = 16 >> (blocksize_bits - 9); - if (i + *offset > (*elen >> blocksize_bits)) - i = (*elen >> blocksize_bits)-*offset; - for (num = 0; i > 0; i--) { - block = udf_get_lb_pblock(dir->i_sb, eloc, - *offset + i); - tmp = udf_tgetblk(dir->i_sb, block); - if (tmp && !buffer_uptodate(tmp) && - !buffer_locked(tmp)) - bha[num++] = tmp; - else - brelse(tmp); - } - if (num) { - bh_readahead_batch(num, bha, REQ_RAHEAD); - for (i = 0; i < num; i++) - 
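udf_readahead_dir() above ("Readahead 8k once we are at 8k boundary") sizes its batch as 16 >> (i_blkbits - 9), i.e. sixteen 512-byte sectors re-expressed in filesystem blocks. A quick userspace check that this yields 8KB for every block size UDF supports:

#include <stdio.h>

int main(void)
{
        for (int blkbits = 9; blkbits <= 12; blkbits++) {
                unsigned int ralen = 16 >> (blkbits - 9); /* blocks per batch */
                printf("blocksize %5d: %2u blocks = %u bytes\n",
                       1 << blkbits, ralen, ralen << blkbits);
        }
        return 0;
}

Every line prints 8192 bytes, which is why the boundary test (iter->loffset & (ralen - 1)) works unchanged across block sizes.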
brelse(bha[i]); - } +/* + * Updates loffset to point to next directory block; eloc, elen & epos are + * updated if we need to traverse to the next extent as well. + */ +static int udf_fiiter_advance_blk(struct udf_fileident_iter *iter) +{ + iter->loffset++; + if (iter->loffset < DIV_ROUND_UP(iter->elen, 1<<iter->dir->i_blkbits)) + return 0; + + iter->loffset = 0; + if (udf_next_aext(iter->dir, &iter->epos, &iter->eloc, &iter->elen, 1) + != (EXT_RECORDED_ALLOCATED >> 30)) { + if (iter->pos == iter->dir->i_size) { + iter->elen = 0; + return 0; } - } else if (fibh->sbh != fibh->ebh) { - brelse(fibh->sbh); - fibh->sbh = fibh->ebh; + udf_err(iter->dir->i_sb, + "extent after position %llu not allocated in directory (ino %lu)\n", + (unsigned long long)iter->pos, iter->dir->i_ino); + return -EFSCORRUPTED; } + return 0; +} - fi = udf_get_fileident(fibh->sbh->b_data, dir->i_sb->s_blocksize, - &(fibh->eoffset)); - - if (!fi) - return NULL; +static int udf_fiiter_load_bhs(struct udf_fileident_iter *iter) +{ + int blksize = 1 << iter->dir->i_blkbits; + int off = iter->pos & (blksize - 1); + int err; + struct fileIdentDesc *fi; - *nf_pos += fibh->eoffset - fibh->soffset; + /* Is there any further extent we can map from? */ + if (!iter->bh[0] && iter->elen) { + iter->bh[0] = udf_fiiter_bread_blk(iter); + if (!iter->bh[0]) { + err = -ENOMEM; + goto out_brelse; + } + if (!buffer_uptodate(iter->bh[0])) { + err = -EIO; + goto out_brelse; + } + } + /* There's no next block so we are done */ + if (iter->pos >= iter->dir->i_size) + return 0; + /* Need to fetch next block as well? */ + if (off + sizeof(struct fileIdentDesc) > blksize) + goto fetch_next; + fi = (struct fileIdentDesc *)(iter->bh[0]->b_data + off); + /* Need to fetch next block to get name? */ + if (off + udf_dir_entry_len(fi) > blksize) { +fetch_next: + err = udf_fiiter_advance_blk(iter); + if (err) + goto out_brelse; + iter->bh[1] = udf_fiiter_bread_blk(iter); + if (!iter->bh[1]) { + err = -ENOMEM; + goto out_brelse; + } + if (!buffer_uptodate(iter->bh[1])) { + err = -EIO; + goto out_brelse; + } + } + return 0; +out_brelse: + brelse(iter->bh[0]); + brelse(iter->bh[1]); + iter->bh[0] = iter->bh[1] = NULL; + return err; +} - if (fibh->eoffset <= dir->i_sb->s_blocksize) { - memcpy((uint8_t *)cfi, (uint8_t *)fi, - sizeof(struct fileIdentDesc)); - } else if (fibh->eoffset > dir->i_sb->s_blocksize) { - uint32_t lextoffset = epos->offset; +int udf_fiiter_init(struct udf_fileident_iter *iter, struct inode *dir, + loff_t pos) +{ + struct udf_inode_info *iinfo = UDF_I(dir); + int err = 0; + + iter->dir = dir; + iter->bh[0] = iter->bh[1] = NULL; + iter->pos = pos; + iter->elen = 0; + iter->epos.bh = NULL; + iter->name = NULL; + /* + * When directory is verified, we don't expect directory iteration to + * fail and it can be difficult to undo without corrupting filesystem. + * So just do not allow memory allocation failures here. 
+ */ + iter->namebuf = kmalloc(UDF_NAME_LEN_CS0, GFP_KERNEL | __GFP_NOFAIL); - if (udf_next_aext(dir, epos, eloc, elen, 1) != - (EXT_RECORDED_ALLOCATED >> 30)) - return NULL; + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + err = udf_copy_fi(iter); + goto out; + } - block = udf_get_lb_pblock(dir->i_sb, eloc, *offset); + if (inode_bmap(dir, iter->pos >> dir->i_blkbits, &iter->epos, + &iter->eloc, &iter->elen, &iter->loffset) != + (EXT_RECORDED_ALLOCATED >> 30)) { + if (pos == dir->i_size) + return 0; + udf_err(dir->i_sb, + "position %llu not allocated in directory (ino %lu)\n", + (unsigned long long)pos, dir->i_ino); + err = -EFSCORRUPTED; + goto out; + } + err = udf_fiiter_load_bhs(iter); + if (err < 0) + goto out; + err = udf_copy_fi(iter); +out: + if (err < 0) + udf_fiiter_release(iter); + return err; +} - (*offset)++; +int udf_fiiter_advance(struct udf_fileident_iter *iter) +{ + unsigned int oldoff, len; + int blksize = 1 << iter->dir->i_blkbits; + int err; + + oldoff = iter->pos & (blksize - 1); + len = udf_dir_entry_len(&iter->fi); + iter->pos += len; + if (UDF_I(iter->dir)->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + if (oldoff + len >= blksize) { + brelse(iter->bh[0]); + iter->bh[0] = NULL; + /* Next block already loaded? */ + if (iter->bh[1]) { + iter->bh[0] = iter->bh[1]; + iter->bh[1] = NULL; + } else { + err = udf_fiiter_advance_blk(iter); + if (err < 0) + return err; + } + } + err = udf_fiiter_load_bhs(iter); + if (err < 0) + return err; + } + return udf_copy_fi(iter); +} - if ((*offset << dir->i_sb->s_blocksize_bits) >= *elen) - *offset = 0; - else - epos->offset = lextoffset; +void udf_fiiter_release(struct udf_fileident_iter *iter) +{ + iter->dir = NULL; + brelse(iter->bh[0]); + brelse(iter->bh[1]); + iter->bh[0] = iter->bh[1] = NULL; + kfree(iter->namebuf); + iter->namebuf = NULL; +} - fibh->soffset -= dir->i_sb->s_blocksize; - fibh->eoffset -= dir->i_sb->s_blocksize; +static void udf_copy_to_bufs(void *buf1, int len1, void *buf2, int len2, + int off, void *src, int len) +{ + int copy; + + if (off >= len1) { + off -= len1; + } else { + copy = min(off + len, len1) - off; + memcpy(buf1 + off, src, copy); + src += copy; + len -= copy; + off = 0; + } + if (len > 0) { + if (WARN_ON_ONCE(off + len > len2 || !buf2)) + return; + memcpy(buf2 + off, src, len); + } +} - fibh->ebh = udf_tread(dir->i_sb, block); - if (!fibh->ebh) - return NULL; +static uint16_t udf_crc_fi_bufs(void *buf1, int len1, void *buf2, int len2, + int off, int len) +{ + int copy; + uint16_t crc = 0; + + if (off >= len1) { + off -= len1; + } else { + copy = min(off + len, len1) - off; + crc = crc_itu_t(crc, buf1 + off, copy); + len -= copy; + off = 0; + } + if (len > 0) { + if (WARN_ON_ONCE(off + len > len2 || !buf2)) + return 0; + crc = crc_itu_t(crc, buf2 + off, len); + } + return crc; +} - if (sizeof(struct fileIdentDesc) > -fibh->soffset) { - int fi_len; +static void udf_copy_fi_to_bufs(char *buf1, int len1, char *buf2, int len2, + int off, struct fileIdentDesc *fi, + uint8_t *impuse, uint8_t *name) +{ + uint16_t crc; + int fioff = off; + int crcoff = off + sizeof(struct tag); + unsigned int crclen = udf_dir_entry_len(fi) - sizeof(struct tag); + char zeros[UDF_NAME_PAD] = {}; + int endoff = off + udf_dir_entry_len(fi); + + udf_copy_to_bufs(buf1, len1, buf2, len2, off, fi, + sizeof(struct fileIdentDesc)); + off += sizeof(struct fileIdentDesc); + if (impuse) + udf_copy_to_bufs(buf1, len1, buf2, len2, off, impuse, + le16_to_cpu(fi->lengthOfImpUse)); + off += le16_to_cpu(fi->lengthOfImpUse); + if (name) { + 
udf_copy_to_bufs(buf1, len1, buf2, len2, off, name, + fi->lengthFileIdent); + off += fi->lengthFileIdent; + udf_copy_to_bufs(buf1, len1, buf2, len2, off, zeros, + endoff - off); + } - memcpy((uint8_t *)cfi, (uint8_t *)fi, -fibh->soffset); - memcpy((uint8_t *)cfi - fibh->soffset, - fibh->ebh->b_data, - sizeof(struct fileIdentDesc) + fibh->soffset); + crc = udf_crc_fi_bufs(buf1, len1, buf2, len2, crcoff, crclen); + fi->descTag.descCRC = cpu_to_le16(crc); + fi->descTag.descCRCLength = cpu_to_le16(crclen); + fi->descTag.tagChecksum = udf_tag_checksum(&fi->descTag); - fi_len = udf_dir_entry_len(cfi); - *nf_pos += fi_len - (fibh->eoffset - fibh->soffset); - fibh->eoffset = fibh->soffset + fi_len; - } else { - memcpy((uint8_t *)cfi, (uint8_t *)fi, - sizeof(struct fileIdentDesc)); - } - } - /* Got last entry outside of dir size - fs is corrupted! */ - if (*nf_pos > dir->i_size) - return NULL; - return fi; + udf_copy_to_bufs(buf1, len1, buf2, len2, fioff, fi, sizeof(struct tag)); } -struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset) +void udf_fiiter_write_fi(struct udf_fileident_iter *iter, uint8_t *impuse) { - struct fileIdentDesc *fi; - int lengthThisIdent; - uint8_t *ptr; - int padlen; + struct udf_inode_info *iinfo = UDF_I(iter->dir); + void *buf1, *buf2 = NULL; + int len1, len2 = 0, off; + int blksize = 1 << iter->dir->i_blkbits; - if ((!buffer) || (!offset)) { - udf_debug("invalidparms, buffer=%p, offset=%p\n", - buffer, offset); - return NULL; + off = iter->pos & (blksize - 1); + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + buf1 = iinfo->i_data + iinfo->i_lenEAttr; + len1 = iter->dir->i_size; + } else { + buf1 = iter->bh[0]->b_data; + len1 = blksize; + if (iter->bh[1]) { + buf2 = iter->bh[1]->b_data; + len2 = blksize; + } } - ptr = buffer; + udf_copy_fi_to_bufs(buf1, len1, buf2, len2, off, &iter->fi, impuse, + iter->name == iter->namebuf ? iter->name : NULL); - if ((*offset > 0) && (*offset < bufsize)) - ptr += *offset; - fi = (struct fileIdentDesc *)ptr; - if (fi->descTag.tagIdent != cpu_to_le16(TAG_IDENT_FID)) { - udf_debug("0x%x != TAG_IDENT_FID\n", - le16_to_cpu(fi->descTag.tagIdent)); - udf_debug("offset: %d sizeof: %lu bufsize: %d\n", - *offset, (unsigned long)sizeof(struct fileIdentDesc), - bufsize); - return NULL; + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + mark_inode_dirty(iter->dir); + } else { + mark_buffer_dirty_inode(iter->bh[0], iter->dir); + if (iter->bh[1]) + mark_buffer_dirty_inode(iter->bh[1], iter->dir); } - if ((*offset + sizeof(struct fileIdentDesc)) > bufsize) - lengthThisIdent = sizeof(struct fileIdentDesc); - else - lengthThisIdent = sizeof(struct fileIdentDesc) + - fi->lengthFileIdent + le16_to_cpu(fi->lengthOfImpUse); + inode_inc_iversion(iter->dir); +} - /* we need to figure padding, too! 
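udf_copy_to_bufs() above is the generic "store a span that may spill from the first buffer into the second" primitive that udf_copy_fi_to_bufs() and udf_fiiter_write_fi() build on; a standalone userspace rendition to make the offset bookkeeping concrete (same logic, WARN_ON and kernel types dropped):

#include <stdio.h>
#include <string.h>

static void copy_to_bufs(char *buf1, int len1, char *buf2, int len2,
                         int off, const char *src, int len)
{
        if (off >= len1) {
                off -= len1;    /* span starts inside the second buffer */
        } else {
                int copy = (off + len < len1 ? off + len : len1) - off;
                memcpy(buf1 + off, src, copy);
                src += copy;
                len -= copy;
                off = 0;
        }
        if (len > 0)
                memcpy(buf2 + off, src, len);   /* remainder, if any */
}

int main(void)
{
        char a[9] = "........", b[9] = "........";

        copy_to_bufs(a, 8, b, 8, 6, "XYZW", 4); /* straddles the boundary */
        printf("%s|%s\n", a, b);                /* ......XY|ZW...... */
        return 0;
}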
*/ - padlen = lengthThisIdent % UDF_NAME_PAD; - if (padlen) - lengthThisIdent += (UDF_NAME_PAD - padlen); - *offset = *offset + lengthThisIdent; +void udf_fiiter_update_elen(struct udf_fileident_iter *iter, uint32_t new_elen) +{ + struct udf_inode_info *iinfo = UDF_I(iter->dir); + int diff = new_elen - iter->elen; + + /* Skip update when we already went past the last extent */ + if (!iter->elen) + return; + iter->elen = new_elen; + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + iter->epos.offset -= sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + iter->epos.offset -= sizeof(struct long_ad); + udf_write_aext(iter->dir, &iter->epos, &iter->eloc, iter->elen, 1); + iinfo->i_lenExtents += diff; + mark_inode_dirty(iter->dir); +} - return fi; +/* Append new block to directory. @iter is expected to point at EOF */ +int udf_fiiter_append_blk(struct udf_fileident_iter *iter) +{ + struct udf_inode_info *iinfo = UDF_I(iter->dir); + int blksize = 1 << iter->dir->i_blkbits; + struct buffer_head *bh; + sector_t block; + uint32_t old_elen = iter->elen; + int err; + + if (WARN_ON_ONCE(iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)) + return -EINVAL; + + /* Round up last extent in the file */ + udf_fiiter_update_elen(iter, ALIGN(iter->elen, blksize)); + + /* Allocate new block and refresh mapping information */ + block = iinfo->i_lenExtents >> iter->dir->i_blkbits; + bh = udf_bread(iter->dir, block, 1, &err); + if (!bh) { + udf_fiiter_update_elen(iter, old_elen); + return err; + } + if (inode_bmap(iter->dir, block, &iter->epos, &iter->eloc, &iter->elen, + &iter->loffset) != (EXT_RECORDED_ALLOCATED >> 30)) { + udf_err(iter->dir->i_sb, + "block %llu not allocated in directory (ino %lu)\n", + (unsigned long long)block, iter->dir->i_ino); + return -EFSCORRUPTED; + } + if (!(iter->pos & (blksize - 1))) { + brelse(iter->bh[0]); + iter->bh[0] = bh; + } else { + iter->bh[1] = bh; + } + return 0; } struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset, diff --git a/fs/udf/file.c b/fs/udf/file.c index 5c659e23e578..8238f742377b 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -38,100 +38,55 @@ #include "udf_i.h" #include "udf_sb.h" -static void __udf_adinicb_readpage(struct page *page) +static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf) { - struct inode *inode = page->mapping->host; - char *kaddr; - struct udf_inode_info *iinfo = UDF_I(inode); - loff_t isize = i_size_read(inode); - - /* - * We have to be careful here as truncate can change i_size under us. - * So just sample it once and use the same value everywhere. 
- */ - kaddr = kmap_atomic(page); - memcpy(kaddr, iinfo->i_data + iinfo->i_lenEAttr, isize); - memset(kaddr + isize, 0, PAGE_SIZE - isize); - flush_dcache_page(page); - SetPageUptodate(page); - kunmap_atomic(kaddr); -} - -static int udf_adinicb_read_folio(struct file *file, struct folio *folio) -{ - BUG_ON(!folio_test_locked(folio)); - __udf_adinicb_readpage(&folio->page); - folio_unlock(folio); - - return 0; -} - -static int udf_adinicb_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - char *kaddr; - struct udf_inode_info *iinfo = UDF_I(inode); - - BUG_ON(!PageLocked(page)); - - kaddr = kmap_atomic(page); - memcpy(iinfo->i_data + iinfo->i_lenEAttr, kaddr, i_size_read(inode)); - SetPageUptodate(page); - kunmap_atomic(kaddr); - mark_inode_dirty(inode); - unlock_page(page); - - return 0; -} - -static int udf_adinicb_write_begin(struct file *file, - struct address_space *mapping, loff_t pos, - unsigned len, struct page **pagep, - void **fsdata) -{ - struct page *page; - - if (WARN_ON_ONCE(pos >= PAGE_SIZE)) - return -EIO; - page = grab_cache_page_write_begin(mapping, 0); - if (!page) - return -ENOMEM; - *pagep = page; - - if (!PageUptodate(page)) - __udf_adinicb_readpage(page); - return 0; -} - -static ssize_t udf_adinicb_direct_IO(struct kiocb *iocb, struct iov_iter *iter) -{ - /* Fallback to buffered I/O. */ - return 0; -} + struct vm_area_struct *vma = vmf->vma; + struct inode *inode = file_inode(vma->vm_file); + struct address_space *mapping = inode->i_mapping; + struct page *page = vmf->page; + loff_t size; + unsigned int end; + vm_fault_t ret = VM_FAULT_LOCKED; + int err; -static int udf_adinicb_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = page->mapping->host; - loff_t last_pos = pos + copied; - if (last_pos > inode->i_size) - i_size_write(inode, last_pos); + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + filemap_invalidate_lock_shared(mapping); + lock_page(page); + size = i_size_read(inode); + if (page->mapping != inode->i_mapping || page_offset(page) >= size) { + unlock_page(page); + ret = VM_FAULT_NOPAGE; + goto out_unlock; + } + /* Space is already allocated for in-ICB file */ + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + goto out_dirty; + if (page->index == size >> PAGE_SHIFT) + end = size & ~PAGE_MASK; + else + end = PAGE_SIZE; + err = __block_write_begin(page, 0, end, udf_get_block); + if (!err) + err = block_commit_write(page, 0, end); + if (err < 0) { + unlock_page(page); + ret = block_page_mkwrite_return(err); + goto out_unlock; + } +out_dirty: set_page_dirty(page); - unlock_page(page); - put_page(page); - return copied; + wait_for_stable_page(page); +out_unlock: + filemap_invalidate_unlock_shared(mapping); + sb_end_pagefault(inode->i_sb); + return ret; } -const struct address_space_operations udf_adinicb_aops = { - .dirty_folio = block_dirty_folio, - .invalidate_folio = block_invalidate_folio, - .read_folio = udf_adinicb_read_folio, - .writepage = udf_adinicb_writepage, - .write_begin = udf_adinicb_write_begin, - .write_end = udf_adinicb_write_end, - .direct_IO = udf_adinicb_direct_IO, +static const struct vm_operations_struct udf_file_vm_ops = { + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = udf_page_mkwrite, }; static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) @@ -140,7 +95,6 @@ static ssize_t 
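udf_page_mkwrite() above moves block allocation to write-fault time; any shared writable mapping drives it, e.g. this userspace snippet (the mount point and file name are hypothetical, and the file is assumed to be at least one page long):

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/mnt/udf/testfile", O_RDWR);     /* hypothetical path */
        if (fd < 0)
                return 1;
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) {
                close(fd);
                return 1;
        }
        memcpy(p, "hello", 5);  /* first store faults; kernel runs ->page_mkwrite */
        munmap(p, 4096);
        close(fd);
        return 0;
}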
udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct udf_inode_info *iinfo = UDF_I(inode); - int err; inode_lock(inode); @@ -148,27 +102,23 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (retval <= 0) goto out; - down_write(&iinfo->i_data_sem); - if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { - loff_t end = iocb->ki_pos + iov_iter_count(from); - - if (inode->i_sb->s_blocksize < - (udf_file_entry_alloc_offset(inode) + end)) { - err = udf_expand_file_adinicb(inode); - if (err) { - inode_unlock(inode); - udf_debug("udf_expand_adinicb: err=%d\n", err); - return err; - } - } else { - iinfo->i_lenAlloc = max(end, inode->i_size); - up_write(&iinfo->i_data_sem); - } - } else - up_write(&iinfo->i_data_sem); + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB && + inode->i_sb->s_blocksize < (udf_file_entry_alloc_offset(inode) + + iocb->ki_pos + iov_iter_count(from))) { + filemap_invalidate_lock(inode->i_mapping); + retval = udf_expand_file_adinicb(inode); + filemap_invalidate_unlock(inode->i_mapping); + if (retval) + goto out; + } retval = __generic_file_write_iter(iocb, from); out: + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB && retval > 0) { + down_write(&iinfo->i_data_sem); + iinfo->i_lenAlloc = inode->i_size; + up_write(&iinfo->i_data_sem); + } inode_unlock(inode); if (retval > 0) { @@ -243,11 +193,19 @@ static int udf_release_file(struct inode *inode, struct file *filp) return 0; } +static int udf_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + file_accessed(file); + vma->vm_ops = &udf_file_vm_ops; + + return 0; +} + const struct file_operations udf_file_operations = { .read_iter = generic_file_read_iter, .unlocked_ioctl = udf_ioctl, .open = generic_file_open, - .mmap = generic_file_mmap, + .mmap = udf_file_mmap, .write_iter = udf_file_write_iter, .release = udf_release_file, .fsync = generic_file_fsync, @@ -256,14 +214,14 @@ const struct file_operations udf_file_operations = { .llseek = generic_file_llseek, }; -static int udf_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +static int udf_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; int error; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; @@ -286,7 +244,7 @@ static int udf_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (attr->ia_valid & ATTR_MODE) udf_update_extra_perms(inode, attr->ia_mode); - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index b5d611cee749..8d50121778a5 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -28,21 +28,7 @@ void udf_free_inode(struct inode *inode) { - struct super_block *sb = inode->i_sb; - struct udf_sb_info *sbi = UDF_SB(sb); - struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb); - - if (lvidiu) { - mutex_lock(&sbi->s_alloc_mutex); - if (S_ISDIR(inode->i_mode)) - le32_add_cpu(&lvidiu->numDirs, -1); - else - le32_add_cpu(&lvidiu->numFiles, -1); - udf_updated_lvid(sb); - mutex_unlock(&sbi->s_alloc_mutex); - } - - udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1); + udf_free_blocks(inode->i_sb, NULL, &UDF_I(inode)->i_location, 0, 1); } struct inode *udf_new_inode(struct inode 
*dir, umode_t mode) @@ -54,7 +40,6 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode) uint32_t start = UDF_I(dir)->i_location.logicalBlockNum; struct udf_inode_info *iinfo; struct udf_inode_info *dinfo = UDF_I(dir); - struct logicalVolIntegrityDescImpUse *lvidiu; int err; inode = new_inode(sb); @@ -92,20 +77,10 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode) return ERR_PTR(err); } - lvidiu = udf_sb_lvidiu(sb); - if (lvidiu) { - iinfo->i_unique = lvid_get_unique_id(sb); - inode->i_generation = iinfo->i_unique; - mutex_lock(&sbi->s_alloc_mutex); - if (S_ISDIR(mode)) - le32_add_cpu(&lvidiu->numDirs, 1); - else - le32_add_cpu(&lvidiu->numFiles, 1); - udf_updated_lvid(sb); - mutex_unlock(&sbi->s_alloc_mutex); - } + iinfo->i_unique = lvid_get_unique_id(sb); + inode->i_generation = iinfo->i_unique; - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET)) inode->i_uid = sbi->s_uid; if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET)) diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 34e416327dd4..f7a9607c2b95 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -52,21 +52,24 @@ #define FE_DELETE_PERMS (FE_PERM_U_DELETE | FE_PERM_G_DELETE | \ FE_PERM_O_DELETE) +struct udf_map_rq; + static umode_t udf_convert_permissions(struct fileEntry *); static int udf_update_inode(struct inode *, int); static int udf_sync_inode(struct inode *inode); static int udf_alloc_i_data(struct inode *inode, size_t size); -static sector_t inode_getblk(struct inode *, sector_t, int *, int *); -static int8_t udf_insert_aext(struct inode *, struct extent_position, - struct kernel_lb_addr, uint32_t); +static int inode_getblk(struct inode *inode, struct udf_map_rq *map); +static int udf_insert_aext(struct inode *, struct extent_position, + struct kernel_lb_addr, uint32_t); static void udf_split_extents(struct inode *, int *, int, udf_pblk_t, struct kernel_long_ad *, int *); static void udf_prealloc_extents(struct inode *, int, int, struct kernel_long_ad *, int *); static void udf_merge_extents(struct inode *, struct kernel_long_ad *, int *); -static void udf_update_extents(struct inode *, struct kernel_long_ad *, int, - int, struct extent_position *); -static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); +static int udf_update_extents(struct inode *, struct kernel_long_ad *, int, + int, struct extent_position *); +static int udf_get_block_wb(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create); static void __udf_clear_extent_cache(struct inode *inode) { @@ -182,14 +185,57 @@ static void udf_write_failed(struct address_space *mapping, loff_t to) } } +static int udf_adinicb_writepage(struct folio *folio, + struct writeback_control *wbc, void *data) +{ + struct page *page = &folio->page; + struct inode *inode = page->mapping->host; + struct udf_inode_info *iinfo = UDF_I(inode); + + BUG_ON(!PageLocked(page)); + memcpy_to_page(page, 0, iinfo->i_data + iinfo->i_lenEAttr, + i_size_read(inode)); + unlock_page(page); + mark_inode_dirty(inode); + + return 0; +} + static int udf_writepages(struct address_space *mapping, - struct writeback_control *wbc) + struct writeback_control *wbc) { - return mpage_writepages(mapping, wbc, udf_get_block); + struct inode *inode = mapping->host; + struct udf_inode_info *iinfo = UDF_I(inode); + + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) + return mpage_writepages(mapping, wbc, udf_get_block_wb); + return write_cache_pages(mapping, 
wbc, udf_adinicb_writepage, NULL); +} + +static void udf_adinicb_readpage(struct page *page) +{ + struct inode *inode = page->mapping->host; + char *kaddr; + struct udf_inode_info *iinfo = UDF_I(inode); + loff_t isize = i_size_read(inode); + + kaddr = kmap_local_page(page); + memcpy(kaddr, iinfo->i_data + iinfo->i_lenEAttr, isize); + memset(kaddr + isize, 0, PAGE_SIZE - isize); + flush_dcache_page(page); + SetPageUptodate(page); + kunmap_local(kaddr); } static int udf_read_folio(struct file *file, struct folio *folio) { + struct udf_inode_info *iinfo = UDF_I(file_inode(file)); + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + udf_adinicb_readpage(&folio->page); + folio_unlock(folio); + return 0; + } return mpage_read_folio(folio, udf_get_block); } @@ -199,15 +245,49 @@ static void udf_readahead(struct readahead_control *rac) } static int udf_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + loff_t pos, unsigned len, + struct page **pagep, void **fsdata) { + struct udf_inode_info *iinfo = UDF_I(file_inode(file)); + struct page *page; int ret; - ret = block_write_begin(mapping, pos, len, pagep, udf_get_block); - if (unlikely(ret)) - udf_write_failed(mapping, pos + len); - return ret; + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + ret = block_write_begin(mapping, pos, len, pagep, + udf_get_block); + if (unlikely(ret)) + udf_write_failed(mapping, pos + len); + return ret; + } + if (WARN_ON_ONCE(pos >= PAGE_SIZE)) + return -EIO; + page = grab_cache_page_write_begin(mapping, 0); + if (!page) + return -ENOMEM; + *pagep = page; + if (!PageUptodate(page)) + udf_adinicb_readpage(page); + return 0; +} + +static int udf_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = file_inode(file); + loff_t last_pos; + + if (UDF_I(inode)->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) + return generic_write_end(file, mapping, pos, len, copied, page, + fsdata); + last_pos = pos + copied; + if (last_pos > inode->i_size) + i_size_write(inode, last_pos); + set_page_dirty(page); + unlock_page(page); + put_page(page); + + return copied; } static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter) @@ -218,6 +298,9 @@ static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter) size_t count = iov_iter_count(iter); ssize_t ret; + /* Fallback to buffered IO for in-ICB files */ + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + return 0; ret = blockdev_direct_IO(iocb, inode, iter, udf_get_block); if (unlikely(ret < 0 && iov_iter_rw(iter) == WRITE)) udf_write_failed(mapping, iocb->ki_pos + count); @@ -226,6 +309,10 @@ static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static sector_t udf_bmap(struct address_space *mapping, sector_t block) { + struct udf_inode_info *iinfo = UDF_I(mapping->host); + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + return -EINVAL; return generic_block_bmap(mapping, block, udf_get_block); } @@ -236,7 +323,7 @@ const struct address_space_operations udf_aops = { .readahead = udf_readahead, .writepages = udf_writepages, .write_begin = udf_write_begin, - .write_end = generic_write_end, + .write_end = udf_write_end, .direct_IO = udf_direct_IO, .bmap = udf_bmap, .migrate_folio = buffer_migrate_folio, @@ -245,18 +332,17 @@ const struct address_space_operations udf_aops = { /* * Expand file stored in ICB to a normal one-block-file * - * This 
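With udf_adinicb_aops removed, the hunks above funnel both in-ICB and block-mapped files through the single udf_aops and branch per method; the recurring shape (names from the patch), which lets the patch stop reassigning inode->i_data.a_ops on a live mapping:

if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
        /* data lives in the ICB: copy straight from iinfo->i_data */
} else {
        /* regular path: mpage/buffer-head helpers via udf_get_block() */
}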
function requires i_data_sem for writing and releases it. * This function requires i_mutex held */ int udf_expand_file_adinicb(struct inode *inode) { struct page *page; - char *kaddr; struct udf_inode_info *iinfo = UDF_I(inode); int err; WARN_ON_ONCE(!inode_is_locked(inode)); if (!iinfo->i_lenAlloc) { + down_write(&iinfo->i_data_sem); if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else @@ -267,26 +353,13 @@ int udf_expand_file_adinicb(struct inode *inode) mark_inode_dirty(inode); return 0; } - /* - * Release i_data_sem so that we can lock a page - page lock ranks - * above i_data_sem. i_mutex still protects us against file changes. - */ - up_write(&iinfo->i_data_sem); page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS); if (!page) return -ENOMEM; - if (!PageUptodate(page)) { - kaddr = kmap_atomic(page); - memset(kaddr + iinfo->i_lenAlloc, 0x00, - PAGE_SIZE - iinfo->i_lenAlloc); - memcpy(kaddr, iinfo->i_data + iinfo->i_lenEAttr, - iinfo->i_lenAlloc); - flush_dcache_page(page); - SetPageUptodate(page); - kunmap_atomic(kaddr); - } + if (!PageUptodate(page)) + udf_adinicb_readpage(page); down_write(&iinfo->i_data_sem); memset(iinfo->i_data + iinfo->i_lenEAttr, 0x00, iinfo->i_lenAlloc); @@ -295,8 +368,6 @@ int udf_expand_file_adinicb(struct inode *inode) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; - /* from now on we have normal address_space methods */ - inode->i_data.a_ops = &udf_aops; set_page_dirty(page); unlock_page(page); up_write(&iinfo->i_data_sem); @@ -305,12 +376,10 @@ int udf_expand_file_adinicb(struct inode *inode) /* Restore everything back so that we don't lose data... */ lock_page(page); down_write(&iinfo->i_data_sem); - kaddr = kmap_atomic(page); - memcpy(iinfo->i_data + iinfo->i_lenEAttr, kaddr, inode->i_size); - kunmap_atomic(kaddr); + memcpy_to_page(page, 0, iinfo->i_data + iinfo->i_lenEAttr, + inode->i_size); unlock_page(page); iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; - inode->i_data.a_ops = &udf_adinicb_aops; iinfo->i_lenAlloc = inode->i_size; up_write(&iinfo->i_data_sem); } @@ -320,162 +389,103 @@ int udf_expand_file_adinicb(struct inode *inode) return err; } -struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, - udf_pblk_t *block, int *err) -{ - udf_pblk_t newblock; - struct buffer_head *dbh = NULL; - struct kernel_lb_addr eloc; - uint8_t alloctype; - struct extent_position epos; +#define UDF_MAP_CREATE 0x01 /* Mapping can allocate new blocks */ +#define UDF_MAP_NOPREALLOC 0x02 /* Do not preallocate blocks */ - struct udf_fileident_bh sfibh, dfibh; - loff_t f_pos = udf_ext0_offset(inode); - int size = udf_ext0_offset(inode) + inode->i_size; - struct fileIdentDesc cfi, *sfi, *dfi; - struct udf_inode_info *iinfo = UDF_I(inode); +#define UDF_BLK_MAPPED 0x01 /* Block was successfully mapped */ +#define UDF_BLK_NEW 0x02 /* Block was freshly allocated */ - if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) - alloctype = ICBTAG_FLAG_AD_SHORT; - else - alloctype = ICBTAG_FLAG_AD_LONG; +struct udf_map_rq { + sector_t lblk; + udf_pblk_t pblk; + int iflags; /* UDF_MAP_ flags determining behavior */ + int oflags; /* UDF_BLK_ flags reporting results */ +}; - if (!inode->i_size) { - iinfo->i_alloc_type = alloctype; - mark_inode_dirty(inode); - return NULL; - } +static int udf_map_block(struct inode *inode, struct udf_map_rq *map) +{ + int err; + struct udf_inode_info *iinfo = UDF_I(inode); - /* alloc block, and copy data to it */ - *block = 
udf_new_block(inode->i_sb, inode, - iinfo->i_location.partitionReferenceNum, - iinfo->i_location.logicalBlockNum, err); - if (!(*block)) - return NULL; - newblock = udf_get_pblock(inode->i_sb, *block, - iinfo->i_location.partitionReferenceNum, - 0); - if (!newblock) - return NULL; - dbh = udf_tgetblk(inode->i_sb, newblock); - if (!dbh) - return NULL; - lock_buffer(dbh); - memset(dbh->b_data, 0x00, inode->i_sb->s_blocksize); - set_buffer_uptodate(dbh); - unlock_buffer(dbh); - mark_buffer_dirty_inode(dbh, inode); - - sfibh.soffset = sfibh.eoffset = - f_pos & (inode->i_sb->s_blocksize - 1); - sfibh.sbh = sfibh.ebh = NULL; - dfibh.soffset = dfibh.eoffset = 0; - dfibh.sbh = dfibh.ebh = dbh; - while (f_pos < size) { - iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; - sfi = udf_fileident_read(inode, &f_pos, &sfibh, &cfi, NULL, - NULL, NULL, NULL); - if (!sfi) { - brelse(dbh); - return NULL; - } - iinfo->i_alloc_type = alloctype; - sfi->descTag.tagLocation = cpu_to_le32(*block); - dfibh.soffset = dfibh.eoffset; - dfibh.eoffset += (sfibh.eoffset - sfibh.soffset); - dfi = (struct fileIdentDesc *)(dbh->b_data + dfibh.soffset); - if (udf_write_fi(inode, sfi, dfi, &dfibh, sfi->impUse, - udf_get_fi_ident(sfi))) { - iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; - brelse(dbh); - return NULL; + map->oflags = 0; + if (!(map->iflags & UDF_MAP_CREATE)) { + struct kernel_lb_addr eloc; + uint32_t elen; + sector_t offset; + struct extent_position epos = {}; + + down_read(&iinfo->i_data_sem); + if (inode_bmap(inode, map->lblk, &epos, &eloc, &elen, &offset) + == (EXT_RECORDED_ALLOCATED >> 30)) { + map->pblk = udf_get_lb_pblock(inode->i_sb, &eloc, + offset); + map->oflags |= UDF_BLK_MAPPED; } - } - mark_buffer_dirty_inode(dbh, inode); + up_read(&iinfo->i_data_sem); + brelse(epos.bh); - memset(iinfo->i_data + iinfo->i_lenEAttr, 0, iinfo->i_lenAlloc); - iinfo->i_lenAlloc = 0; - eloc.logicalBlockNum = *block; - eloc.partitionReferenceNum = - iinfo->i_location.partitionReferenceNum; - iinfo->i_lenExtents = inode->i_size; - epos.bh = NULL; - epos.block = iinfo->i_location; - epos.offset = udf_file_entry_alloc_offset(inode); - udf_add_aext(inode, &epos, &eloc, inode->i_size, 0); - /* UniqueID stuff */ - - brelse(epos.bh); - mark_inode_dirty(inode); - return dbh; -} - -static int udf_get_block(struct inode *inode, sector_t block, - struct buffer_head *bh_result, int create) -{ - int err, new; - sector_t phys = 0; - struct udf_inode_info *iinfo; - - if (!create) { - phys = udf_block_map(inode, block); - if (phys) - map_bh(bh_result, inode->i_sb, phys); return 0; } - err = -EIO; - new = 0; - iinfo = UDF_I(inode); - down_write(&iinfo->i_data_sem); - if (block == iinfo->i_next_alloc_block + 1) { - iinfo->i_next_alloc_block++; - iinfo->i_next_alloc_goal++; - } - /* * Block beyond EOF and prealloc extents? Just discard preallocation * as it is not useful and complicates things. 
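struct udf_map_rq above replaces the old pile of out-parameters with a small request object; a sketch of the calling convention that udf_bread() and __udf_get_block() adopt later in this patch (kernel context, inode/block/create assumed in scope):

struct udf_map_rq map = {
        .lblk   = block,                        /* logical block to resolve */
        .iflags = create ? UDF_MAP_CREATE : 0,  /* may allocate */
};
int err = udf_map_block(inode, &map);

if (err < 0)
        return err;
if (map.oflags & UDF_BLK_MAPPED) {
        /* map.pblk holds the physical block number */
        if (map.oflags & UDF_BLK_NEW) {
                /* freshly allocated: zero it before exposing the contents */
        }
}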
*/ - if (((loff_t)block) << inode->i_blkbits > iinfo->i_lenExtents) + if (((loff_t)map->lblk) << inode->i_blkbits >= iinfo->i_lenExtents) udf_discard_prealloc(inode); udf_clear_extent_cache(inode); - phys = inode_getblk(inode, block, &err, &new); - if (!phys) - goto abort; - - if (new) - set_buffer_new(bh_result); - map_bh(bh_result, inode->i_sb, phys); - -abort: + err = inode_getblk(inode, map); up_write(&iinfo->i_data_sem); return err; } -static struct buffer_head *udf_getblk(struct inode *inode, udf_pblk_t block, - int create, int *err) +static int __udf_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int flags) { - struct buffer_head *bh; - struct buffer_head dummy; - - dummy.b_state = 0; - dummy.b_blocknr = -1000; - *err = udf_get_block(inode, block, &dummy, create); - if (!*err && buffer_mapped(&dummy)) { - bh = sb_getblk(inode->i_sb, dummy.b_blocknr); - if (buffer_new(&dummy)) { - lock_buffer(bh); - memset(bh->b_data, 0x00, inode->i_sb->s_blocksize); - set_buffer_uptodate(bh); - unlock_buffer(bh); - mark_buffer_dirty_inode(bh, inode); - } - return bh; + int err; + struct udf_map_rq map = { + .lblk = block, + .iflags = flags, + }; + + err = udf_map_block(inode, &map); + if (err < 0) + return err; + if (map.oflags & UDF_BLK_MAPPED) { + map_bh(bh_result, inode->i_sb, map.pblk); + if (map.oflags & UDF_BLK_NEW) + set_buffer_new(bh_result); } + return 0; +} - return NULL; +int udf_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + int flags = create ? UDF_MAP_CREATE : 0; + + /* + * We preallocate blocks only for regular files. It also makes sense + * for directories but there's a problem when to drop the + * preallocation. We might use some delayed work for that but I feel + * it's overengineering for a filesystem like UDF. + */ + if (!S_ISREG(inode->i_mode)) + flags |= UDF_MAP_NOPREALLOC; + return __udf_get_block(inode, block, bh_result, flags); +} + +/* + * We shouldn't be allocating blocks on page writeback since we allocate them + * on page fault. We can spot dirty buffers without allocated blocks though + * when truncate expands file. These however don't have valid data so we can + * safely ignore them. So never allocate blocks from page writeback. + */ +static int udf_get_block_wb(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + return __udf_get_block(inode, block, bh_result, 0); } /* Extend the file with new blocks totaling 'new_block_bytes', @@ -509,6 +519,7 @@ static int udf_do_extend_file(struct inode *inode, ~(sb->s_blocksize - 1); } + add = 0; /* Can we merge with the previous extent? */ if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) == EXT_NOT_RECORDED_NOT_ALLOCATED) { @@ -521,8 +532,10 @@ static int udf_do_extend_file(struct inode *inode, } if (fake) { - udf_add_aext(inode, last_pos, &last_ext->extLocation, - last_ext->extLength, 1); + err = udf_add_aext(inode, last_pos, &last_ext->extLocation, + last_ext->extLength, 1); + if (err < 0) + goto out_err; count++; } else { struct kernel_lb_addr tmploc; @@ -539,6 +552,7 @@ static int udf_do_extend_file(struct inode *inode, if (new_block_bytes) udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0); } + iinfo->i_lenExtents += add; /* Managed to do everything necessary? 
*/ if (!new_block_bytes) @@ -556,7 +570,8 @@ static int udf_do_extend_file(struct inode *inode, err = udf_add_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); if (err) - return err; + goto out_err; + iinfo->i_lenExtents += add; count++; } if (new_block_bytes) { @@ -565,7 +580,8 @@ static int udf_do_extend_file(struct inode *inode, err = udf_add_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); if (err) - return err; + goto out_err; + iinfo->i_lenExtents += new_block_bytes; count++; } @@ -579,6 +595,11 @@ out: return -EIO; return count; +out_err: + /* Remove extents we've created so far */ + udf_clear_extent_cache(inode); + udf_truncate_extents(inode); + return err; } /* Extend the final block of the file to final_block_len bytes */ @@ -626,6 +647,7 @@ static int udf_extend_file(struct inode *inode, loff_t newsize) else BUG(); + down_write(&iinfo->i_data_sem); /* * When creating hole in file, just don't bother with preserving * preallocation. It likely won't be very useful anyway. @@ -668,14 +690,13 @@ static int udf_extend_file(struct inode *inode, loff_t newsize) if (err < 0) goto out; err = 0; - iinfo->i_lenExtents = newsize; out: brelse(epos.bh); + up_write(&iinfo->i_data_sem); return err; } -static sector_t inode_getblk(struct inode *inode, sector_t block, - int *err, int *new) +static int inode_getblk(struct inode *inode, struct udf_map_rq *map) { struct kernel_long_ad laarr[EXTENT_MERGE_SIZE]; struct extent_position prev_epos, cur_epos, next_epos; @@ -684,21 +705,20 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, struct kernel_lb_addr eloc, tmpeloc; int c = 1; loff_t lbcount = 0, b_off = 0; - udf_pblk_t newblocknum, newblock = 0; + udf_pblk_t newblocknum; sector_t offset = 0; int8_t etype; struct udf_inode_info *iinfo = UDF_I(inode); udf_pblk_t goal = 0, pgoal = iinfo->i_location.logicalBlockNum; int lastblock = 0; bool isBeyondEOF; + int ret = 0; - *err = 0; - *new = 0; prev_epos.offset = udf_file_entry_alloc_offset(inode); prev_epos.block = iinfo->i_location; prev_epos.bh = NULL; cur_epos = next_epos = prev_epos; - b_off = (loff_t)block << inode->i_sb->s_blocksize_bits; + b_off = (loff_t)map->lblk << inode->i_sb->s_blocksize_bits; /* find the extent which contains the block we are looking for. alternate between laarr[0] and laarr[1] for locations of the @@ -757,15 +777,18 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, elen = EXT_RECORDED_ALLOCATED | ((elen + inode->i_sb->s_blocksize - 1) & ~(inode->i_sb->s_blocksize - 1)); + iinfo->i_lenExtents = + ALIGN(iinfo->i_lenExtents, + inode->i_sb->s_blocksize); udf_write_aext(inode, &cur_epos, &eloc, elen, 1); } - newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset); + map->oflags = UDF_BLK_MAPPED; + map->pblk = udf_get_lb_pblock(inode->i_sb, &eloc, offset); goto out_free; } /* Are we beyond EOF and preallocated extent? */ if (etype == -1) { - int ret; loff_t hole_len; isBeyondEOF = true; @@ -785,26 +808,22 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, /* Create extents for the hole between EOF and offset */ hole_len = (loff_t)offset << inode->i_blkbits; ret = udf_do_extend_file(inode, &prev_epos, laarr, hole_len); - if (ret < 0) { - *err = ret; + if (ret < 0) goto out_free; - } c = 0; offset = 0; count += ret; - /* We are not covered by a preallocated extent? */ - if ((laarr[0].extLength & UDF_EXTENT_FLAG_MASK) != - EXT_NOT_RECORDED_ALLOCATED) { - /* Is there any real extent? - otherwise we overwrite - * the fake one... 
*/ - if (count) - c = !c; - laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | - inode->i_sb->s_blocksize; - memset(&laarr[c].extLocation, 0x00, - sizeof(struct kernel_lb_addr)); - count++; - } + /* + * Is there any real extent? - otherwise we overwrite the fake + * one... + */ + if (count) + c = !c; + laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | + inode->i_sb->s_blocksize; + memset(&laarr[c].extLocation, 0x00, + sizeof(struct kernel_lb_addr)); + count++; endnum = c + 1; lastblock = 1; } else { @@ -838,7 +857,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, if ((laarr[c].extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30)) newblocknum = laarr[c].extLocation.logicalBlockNum + offset; else { /* otherwise, allocate a new block */ - if (iinfo->i_next_alloc_block == block) + if (iinfo->i_next_alloc_block == map->lblk) goal = iinfo->i_next_alloc_goal; if (!goal) { @@ -848,11 +867,9 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, newblocknum = udf_new_block(inode->i_sb, inode, iinfo->i_location.partitionReferenceNum, - goal, err); - if (!newblocknum) { - *err = -ENOSPC; + goal, &ret); + if (!newblocknum) goto out_free; - } if (isBeyondEOF) iinfo->i_lenExtents += inode->i_sb->s_blocksize; } @@ -863,11 +880,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, * block */ udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum); - /* We preallocate blocks only for regular files. It also makes sense - * for directories but there's a problem when to drop the - * preallocation. We might use some delayed work for that but I feel - * it's overengineering for a filesystem like UDF. */ - if (S_ISREG(inode->i_mode)) + if (!(map->iflags & UDF_MAP_NOPREALLOC)) udf_prealloc_extents(inode, c, lastblock, laarr, &endnum); /* merge any continuous blocks in laarr */ @@ -876,28 +889,31 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, /* write back the new extents, inserting new extents if the new number * of extents is greater than the old number, and deleting extents if * the new number of extents is less than the old number */ - udf_update_extents(inode, laarr, startnum, endnum, &prev_epos); + ret = udf_update_extents(inode, laarr, startnum, endnum, &prev_epos); + if (ret < 0) + goto out_free; - newblock = udf_get_pblock(inode->i_sb, newblocknum, + map->pblk = udf_get_pblock(inode->i_sb, newblocknum, iinfo->i_location.partitionReferenceNum, 0); - if (!newblock) { - *err = -EIO; + if (!map->pblk) { + ret = -EFSCORRUPTED; goto out_free; } - *new = 1; - iinfo->i_next_alloc_block = block; - iinfo->i_next_alloc_goal = newblocknum; + map->oflags = UDF_BLK_NEW | UDF_BLK_MAPPED; + iinfo->i_next_alloc_block = map->lblk + 1; + iinfo->i_next_alloc_goal = newblocknum + 1; inode->i_ctime = current_time(inode); if (IS_SYNC(inode)) udf_sync_inode(inode); else mark_inode_dirty(inode); + ret = 0; out_free: brelse(prev_epos.bh); brelse(cur_epos.bh); brelse(next_epos.bh); - return newblock; + return ret; } static void udf_split_extents(struct inode *inode, int *c, int offset, @@ -1080,23 +1096,8 @@ static void udf_merge_extents(struct inode *inode, struct kernel_long_ad *laarr, blocksize - 1) >> blocksize_bits)))) { if (((li->extLength & UDF_EXTENT_LENGTH_MASK) + - (lip1->extLength & UDF_EXTENT_LENGTH_MASK) + - blocksize - 1) & ~UDF_EXTENT_LENGTH_MASK) { - lip1->extLength = (lip1->extLength - - (li->extLength & - UDF_EXTENT_LENGTH_MASK) + - UDF_EXTENT_LENGTH_MASK) & - ~(blocksize - 1); - li->extLength = (li->extLength & - 
UDF_EXTENT_FLAG_MASK) + - (UDF_EXTENT_LENGTH_MASK + 1) - - blocksize; - lip1->extLocation.logicalBlockNum = - li->extLocation.logicalBlockNum + - ((li->extLength & - UDF_EXTENT_LENGTH_MASK) >> - blocksize_bits); - } else { + (lip1->extLength & UDF_EXTENT_LENGTH_MASK) + + blocksize - 1) <= UDF_EXTENT_LENGTH_MASK) { li->extLength = lip1->extLength + (((li->extLength & UDF_EXTENT_LENGTH_MASK) + @@ -1159,21 +1160,30 @@ static void udf_merge_extents(struct inode *inode, struct kernel_long_ad *laarr, } } -static void udf_update_extents(struct inode *inode, struct kernel_long_ad *laarr, - int startnum, int endnum, - struct extent_position *epos) +static int udf_update_extents(struct inode *inode, struct kernel_long_ad *laarr, + int startnum, int endnum, + struct extent_position *epos) { int start = 0, i; struct kernel_lb_addr tmploc; uint32_t tmplen; + int err; if (startnum > endnum) { for (i = 0; i < (startnum - endnum); i++) udf_delete_aext(inode, *epos); } else if (startnum < endnum) { for (i = 0; i < (endnum - startnum); i++) { - udf_insert_aext(inode, *epos, laarr[i].extLocation, - laarr[i].extLength); + err = udf_insert_aext(inode, *epos, + laarr[i].extLocation, + laarr[i].extLength); + /* + * If we fail here, we are likely corrupting the extent + * list and leaking blocks. At least stop early to + * limit the damage. + */ + if (err < 0) + return err; udf_next_aext(inode, epos, &laarr[i].extLocation, &laarr[i].extLength, 1); start++; @@ -1185,17 +1195,36 @@ static void udf_update_extents(struct inode *inode, struct kernel_long_ad *laarr udf_write_aext(inode, epos, &laarr[i].extLocation, laarr[i].extLength, 1); } + return 0; } struct buffer_head *udf_bread(struct inode *inode, udf_pblk_t block, int create, int *err) { struct buffer_head *bh = NULL; + struct udf_map_rq map = { + .lblk = block, + .iflags = UDF_MAP_NOPREALLOC | (create ? 
UDF_MAP_CREATE : 0), + }; - bh = udf_getblk(inode, block, create, err); - if (!bh) + *err = udf_map_block(inode, &map); + if (*err || !(map.oflags & UDF_BLK_MAPPED)) return NULL; + bh = sb_getblk(inode->i_sb, map.pblk); + if (!bh) { + *err = -ENOMEM; + return NULL; + } + if (map.oflags & UDF_BLK_NEW) { + lock_buffer(bh); + memset(bh->b_data, 0x00, inode->i_sb->s_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + mark_buffer_dirty_inode(bh, inode); + return bh; + } + if (bh_read(bh, 0) >= 0) return bh; @@ -1206,7 +1235,7 @@ struct buffer_head *udf_bread(struct inode *inode, udf_pblk_t block, int udf_setsize(struct inode *inode, loff_t newsize) { - int err; + int err = 0; struct udf_inode_info *iinfo; unsigned int bsize = i_blocksize(inode); @@ -1216,28 +1245,25 @@ int udf_setsize(struct inode *inode, loff_t newsize) if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return -EPERM; + filemap_invalidate_lock(inode->i_mapping); iinfo = UDF_I(inode); if (newsize > inode->i_size) { - down_write(&iinfo->i_data_sem); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { - if (bsize < + if (bsize >= (udf_file_entry_alloc_offset(inode) + newsize)) { - err = udf_expand_file_adinicb(inode); - if (err) - return err; down_write(&iinfo->i_data_sem); - } else { iinfo->i_lenAlloc = newsize; + up_write(&iinfo->i_data_sem); goto set_size; } + err = udf_expand_file_adinicb(inode); + if (err) + goto out_unlock; } err = udf_extend_file(inode, newsize); - if (err) { - up_write(&iinfo->i_data_sem); - return err; - } + if (err) + goto out_unlock; set_size: - up_write(&iinfo->i_data_sem); truncate_setsize(inode, newsize); } else { if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { @@ -1254,14 +1280,14 @@ set_size: err = block_truncate_page(inode->i_mapping, newsize, udf_get_block); if (err) - return err; + goto out_unlock; truncate_setsize(inode, newsize); down_write(&iinfo->i_data_sem); udf_clear_extent_cache(inode); err = udf_truncate_extents(inode); up_write(&iinfo->i_data_sem); if (err) - return err; + goto out_unlock; } update_time: inode->i_mtime = inode->i_ctime = current_time(inode); @@ -1269,7 +1295,9 @@ update_time: udf_sync_inode(inode); else mark_inode_dirty(inode); - return 0; +out_unlock: + filemap_invalidate_unlock(inode->i_mapping); + return err; } /* @@ -1381,6 +1409,7 @@ reread: ret = -EIO; goto out; } + iinfo->i_hidden = hidden_inode; iinfo->i_unique = 0; iinfo->i_lenEAttr = 0; iinfo->i_lenExtents = 0; @@ -1537,10 +1566,7 @@ reread: case ICBTAG_FILE_TYPE_REGULAR: case ICBTAG_FILE_TYPE_UNDEF: case ICBTAG_FILE_TYPE_VAT20: - if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - inode->i_data.a_ops = &udf_adinicb_aops; - else - inode->i_data.a_ops = &udf_aops; + inode->i_data.a_ops = &udf_aops; inode->i_op = &udf_file_inode_operations; inode->i_fop = &udf_file_operations; inode->i_mode |= S_IFREG; @@ -1671,7 +1697,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; struct udf_inode_info *iinfo = UDF_I(inode); - bh = udf_tgetblk(inode->i_sb, + bh = sb_getblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb, &iinfo->i_location, 0)); if (!bh) { udf_debug("getblk failure\n"); @@ -1716,8 +1742,12 @@ static int udf_update_inode(struct inode *inode, int do_sync) if (S_ISDIR(inode->i_mode) && inode->i_nlink > 0) fe->fileLinkCount = cpu_to_le16(inode->i_nlink - 1); - else - fe->fileLinkCount = cpu_to_le16(inode->i_nlink); + else { + if (iinfo->i_hidden) + fe->fileLinkCount = cpu_to_le16(0); + else + fe->fileLinkCount = 
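Note the error convention the reworked udf_bread() above hands its callers: a NULL return with *err == 0 simply means the block is not mapped (possible only without create), while a fresh block comes back zeroed and already marked dirty. A caller sketch (kernel context):

int err = 0;
struct buffer_head *bh = udf_bread(dir, blkno, 1, &err);

if (!bh)
        return err ? err : -EIO;        /* with create == 1, err is set on failure */
/* ... use bh->b_data ... */
brelse(bh);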
cpu_to_le16(inode->i_nlink); + } fe->informationLength = cpu_to_le64(inode->i_size); @@ -1888,8 +1918,13 @@ struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino, if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode->i_state & I_NEW)) { + if (UDF_I(inode)->i_hidden != hidden_inode) { + iput(inode); + return ERR_PTR(-EFSCORRUPTED); + } return inode; + } memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr)); err = udf_read_inode(inode, hidden_inode); @@ -1922,7 +1957,7 @@ int udf_setup_indirect_aext(struct inode *inode, udf_pblk_t block, neloc.logicalBlockNum = block; neloc.partitionReferenceNum = epos->block.partitionReferenceNum; - bh = udf_tgetblk(sb, udf_get_lb_pblock(sb, &neloc, 0)); + bh = sb_getblk(sb, udf_get_lb_pblock(sb, &neloc, 0)); if (!bh) return -EIO; lock_buffer(bh); @@ -2139,7 +2174,7 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, epos->offset = sizeof(struct allocExtDesc); brelse(epos->bh); block = udf_get_lb_pblock(inode->i_sb, &epos->block, 0); - epos->bh = udf_tread(inode->i_sb, block); + epos->bh = sb_bread(inode->i_sb, block); if (!epos->bh) { udf_debug("reading block %u failed!\n", block); return -1; @@ -2203,12 +2238,13 @@ int8_t udf_current_aext(struct inode *inode, struct extent_position *epos, return etype; } -static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos, - struct kernel_lb_addr neloc, uint32_t nelen) +static int udf_insert_aext(struct inode *inode, struct extent_position epos, + struct kernel_lb_addr neloc, uint32_t nelen) { struct kernel_lb_addr oeloc; uint32_t oelen; int8_t etype; + int err; if (epos.bh) get_bh(epos.bh); @@ -2218,10 +2254,10 @@ static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos, neloc = oeloc; nelen = (etype << 30) | oelen; } - udf_add_aext(inode, &epos, &neloc, nelen, 1); + err = udf_add_aext(inode, &epos, &neloc, nelen, 1); brelse(epos.bh); - return (nelen >> 30); + return err; } int8_t udf_delete_aext(struct inode *inode, struct extent_position epos) @@ -2339,28 +2375,3 @@ int8_t inode_bmap(struct inode *inode, sector_t block, return etype; } - -udf_pblk_t udf_block_map(struct inode *inode, sector_t block) -{ - struct kernel_lb_addr eloc; - uint32_t elen; - sector_t offset; - struct extent_position epos = {}; - udf_pblk_t ret; - - down_read(&UDF_I(inode)->i_data_sem); - - if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) == - (EXT_RECORDED_ALLOCATED >> 30)) - ret = udf_get_lb_pblock(inode->i_sb, &eloc, offset); - else - ret = 0; - - up_read(&UDF_I(inode)->i_data_sem); - brelse(epos.bh); - - if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_VARCONV)) - return udf_fixed_to_variable(ret); - else - return ret; -} diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c index 46d697172197..c87ed942d076 100644 --- a/fs/udf/lowlevel.c +++ b/fs/udf/lowlevel.c @@ -45,7 +45,7 @@ unsigned int udf_get_last_session(struct super_block *sb) return 0; } -unsigned long udf_get_last_block(struct super_block *sb) +udf_pblk_t udf_get_last_block(struct super_block *sb) { struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk); unsigned long lblock = 0; @@ -54,8 +54,11 @@ unsigned long udf_get_last_block(struct super_block *sb) * The cdrom layer call failed or returned obviously bogus value? * Try using the device size... 
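The guard added to udf_get_last_block() above stops a silent truncation: once the device block count no longer fits the 32-bit udf_pblk_t, the helper reports 0 (unknown) rather than wrapping. The boundary, checked in userspace:

#include <stdint.h>
#include <stdio.h>

typedef uint32_t udf_pblk_t;            /* as in the UDF headers */

int main(void)
{
        uint64_t nr_blocks = 1ULL << 32;        /* first count that overflows */

        /* mirrors: if (sb_bdev_nr_blocks(sb) > ~(udf_pblk_t)0) return 0; */
        if (nr_blocks > (uint64_t)~(udf_pblk_t)0)
                printf("%llu blocks exceed udf_pblk_t, report 0\n",
                       (unsigned long long)nr_blocks);
        return 0;
}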
*/ - if (!cdi || cdrom_get_last_written(cdi, &lblock) || lblock == 0) + if (!cdi || cdrom_get_last_written(cdi, &lblock) || lblock == 0) { + if (sb_bdev_nr_blocks(sb) > ~(udf_pblk_t)0) + return 0; lblock = sb_bdev_nr_blocks(sb); + } if (lblock) return lblock - 1; diff --git a/fs/udf/misc.c b/fs/udf/misc.c index 1614d308d0f0..3777468d06ce 100644 --- a/fs/udf/misc.c +++ b/fs/udf/misc.c @@ -28,22 +28,6 @@ #include "udf_i.h" #include "udf_sb.h" -struct buffer_head *udf_tgetblk(struct super_block *sb, udf_pblk_t block) -{ - if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV)) - return sb_getblk(sb, udf_fixed_to_variable(block)); - else - return sb_getblk(sb, block); -} - -struct buffer_head *udf_tread(struct super_block *sb, udf_pblk_t block) -{ - if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV)) - return sb_bread(sb, udf_fixed_to_variable(block)); - else - return sb_bread(sb, block); -} - struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size, uint32_t type, uint8_t loc) { @@ -216,7 +200,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, if (block == 0xFFFFFFFF) return NULL; - bh = udf_tread(sb, block); + bh = sb_bread(sb, block); if (!bh) { udf_err(sb, "read failed, block=%u, location=%u\n", block, location); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 7c95c549dd64..fd20423d3ed2 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -41,283 +41,93 @@ static inline int udf_match(int len1, const unsigned char *name1, int len2, return !memcmp(name1, name2, len1); } -int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, - struct fileIdentDesc *sfi, struct udf_fileident_bh *fibh, - uint8_t *impuse, uint8_t *fileident) -{ - uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(struct tag); - uint16_t crc; - int offset; - uint16_t liu = le16_to_cpu(cfi->lengthOfImpUse); - uint8_t lfi = cfi->lengthFileIdent; - int padlen = fibh->eoffset - fibh->soffset - liu - lfi - - sizeof(struct fileIdentDesc); - int adinicb = 0; - - if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - adinicb = 1; - - offset = fibh->soffset + sizeof(struct fileIdentDesc); - - if (impuse) { - if (adinicb || (offset + liu < 0)) { - memcpy((uint8_t *)sfi->impUse, impuse, liu); - } else if (offset >= 0) { - memcpy(fibh->ebh->b_data + offset, impuse, liu); - } else { - memcpy((uint8_t *)sfi->impUse, impuse, -offset); - memcpy(fibh->ebh->b_data, impuse - offset, - liu + offset); - } - } - - offset += liu; - - if (fileident) { - if (adinicb || (offset + lfi < 0)) { - memcpy(sfi->impUse + liu, fileident, lfi); - } else if (offset >= 0) { - memcpy(fibh->ebh->b_data + offset, fileident, lfi); - } else { - memcpy(sfi->impUse + liu, fileident, -offset); - memcpy(fibh->ebh->b_data, fileident - offset, - lfi + offset); - } - } - - offset += lfi; - - if (adinicb || (offset + padlen < 0)) { - memset(sfi->impUse + liu + lfi, 0x00, padlen); - } else if (offset >= 0) { - memset(fibh->ebh->b_data + offset, 0x00, padlen); - } else { - memset(sfi->impUse + liu + lfi, 0x00, -offset); - memset(fibh->ebh->b_data, 0x00, padlen + offset); - } - - crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(struct tag), - sizeof(struct fileIdentDesc) - sizeof(struct tag)); - - if (fibh->sbh == fibh->ebh) { - crc = crc_itu_t(crc, (uint8_t *)sfi->impUse, - crclen + sizeof(struct tag) - - sizeof(struct fileIdentDesc)); - } else if (sizeof(struct fileIdentDesc) >= -fibh->soffset) { - crc = crc_itu_t(crc, fibh->ebh->b_data + - sizeof(struct fileIdentDesc) + - fibh->soffset, - crclen + sizeof(struct tag) - - sizeof(struct 
fileIdentDesc)); - } else { - crc = crc_itu_t(crc, (uint8_t *)sfi->impUse, - -fibh->soffset - sizeof(struct fileIdentDesc)); - crc = crc_itu_t(crc, fibh->ebh->b_data, fibh->eoffset); - } - - cfi->descTag.descCRC = cpu_to_le16(crc); - cfi->descTag.descCRCLength = cpu_to_le16(crclen); - cfi->descTag.tagChecksum = udf_tag_checksum(&cfi->descTag); - - if (adinicb || (sizeof(struct fileIdentDesc) <= -fibh->soffset)) { - memcpy((uint8_t *)sfi, (uint8_t *)cfi, - sizeof(struct fileIdentDesc)); - } else { - memcpy((uint8_t *)sfi, (uint8_t *)cfi, -fibh->soffset); - memcpy(fibh->ebh->b_data, (uint8_t *)cfi - fibh->soffset, - sizeof(struct fileIdentDesc) + fibh->soffset); - } - - if (adinicb) { - mark_inode_dirty(inode); - } else { - if (fibh->sbh != fibh->ebh) - mark_buffer_dirty_inode(fibh->ebh, inode); - mark_buffer_dirty_inode(fibh->sbh, inode); - } - inode_inc_iversion(inode); - - return 0; -} - /** - * udf_find_entry - find entry in given directory. + * udf_fiiter_find_entry - find entry in given directory. * * @dir: directory inode to search in * @child: qstr of the name - * @fibh: buffer head / inode with file identifier descriptor we found - * @cfi: found file identifier descriptor with given name + * @iter: iter to use for searching * * This function searches in the directory @dir for a file name @child. When - * found, @fibh points to the buffer head(s) (bh is NULL for in ICB - * directories) containing the file identifier descriptor (FID). In that case - * the function returns pointer to the FID in the buffer or inode - but note - * that FID may be split among two buffers (blocks) so accessing it via that - * pointer isn't easily possible. This pointer can be used only as an iterator - * for other directory manipulation functions. For inspection of the FID @cfi - * can be used - the found FID is copied there. + * found, @iter points to the position in the directory with given entry. * - * Returns pointer to FID, NULL when nothing found, or error code. + * Returns 0 on success, < 0 on error (including -ENOENT). */ -static struct fileIdentDesc *udf_find_entry(struct inode *dir, - const struct qstr *child, - struct udf_fileident_bh *fibh, - struct fileIdentDesc *cfi) +static int udf_fiiter_find_entry(struct inode *dir, const struct qstr *child, + struct udf_fileident_iter *iter) { - struct fileIdentDesc *fi = NULL; - loff_t f_pos; - udf_pblk_t block; int flen; - unsigned char *fname = NULL, *copy_name = NULL; - unsigned char *nameptr; - uint8_t lfi; - uint16_t liu; - loff_t size; - struct kernel_lb_addr eloc; - uint32_t elen; - sector_t offset; - struct extent_position epos = {}; - struct udf_inode_info *dinfo = UDF_I(dir); + unsigned char *fname = NULL; + struct super_block *sb = dir->i_sb; int isdotdot = child->len == 2 && child->name[0] == '.' 
&& child->name[1] == '.'; - struct super_block *sb = dir->i_sb; - - size = udf_ext0_offset(dir) + dir->i_size; - f_pos = udf_ext0_offset(dir); - - fibh->sbh = fibh->ebh = NULL; - fibh->soffset = fibh->eoffset = f_pos & (sb->s_blocksize - 1); - if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { - if (inode_bmap(dir, f_pos >> sb->s_blocksize_bits, &epos, - &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) { - fi = ERR_PTR(-EIO); - goto out_err; - } - - block = udf_get_lb_pblock(sb, &eloc, offset); - if ((++offset << sb->s_blocksize_bits) < elen) { - if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(struct short_ad); - else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(struct long_ad); - } else - offset = 0; - - fibh->sbh = fibh->ebh = udf_tread(sb, block); - if (!fibh->sbh) { - fi = ERR_PTR(-EIO); - goto out_err; - } - } + int ret; fname = kmalloc(UDF_NAME_LEN, GFP_NOFS); - if (!fname) { - fi = ERR_PTR(-ENOMEM); - goto out_err; - } - - while (f_pos < size) { - fi = udf_fileident_read(dir, &f_pos, fibh, cfi, &epos, &eloc, - &elen, &offset); - if (!fi) { - fi = ERR_PTR(-EIO); - goto out_err; - } - - liu = le16_to_cpu(cfi->lengthOfImpUse); - lfi = cfi->lengthFileIdent; - - if (fibh->sbh == fibh->ebh) { - nameptr = udf_get_fi_ident(fi); - } else { - int poffset; /* Unpaded ending offset */ - - poffset = fibh->soffset + sizeof(struct fileIdentDesc) + - liu + lfi; - - if (poffset >= lfi) - nameptr = (uint8_t *)(fibh->ebh->b_data + - poffset - lfi); - else { - if (!copy_name) { - copy_name = kmalloc(UDF_NAME_LEN_CS0, - GFP_NOFS); - if (!copy_name) { - fi = ERR_PTR(-ENOMEM); - goto out_err; - } - } - nameptr = copy_name; - memcpy(nameptr, udf_get_fi_ident(fi), - lfi - poffset); - memcpy(nameptr + lfi - poffset, - fibh->ebh->b_data, poffset); - } - } + if (!fname) + return -ENOMEM; - if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { + for (ret = udf_fiiter_init(iter, dir, 0); + !ret && iter->pos < dir->i_size; + ret = udf_fiiter_advance(iter)) { + if (iter->fi.fileCharacteristics & FID_FILE_CHAR_DELETED) { if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE)) continue; } - if ((cfi->fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) { + if (iter->fi.fileCharacteristics & FID_FILE_CHAR_HIDDEN) { if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE)) continue; } - if ((cfi->fileCharacteristics & FID_FILE_CHAR_PARENT) && + if ((iter->fi.fileCharacteristics & FID_FILE_CHAR_PARENT) && isdotdot) goto out_ok; - if (!lfi) + if (!iter->fi.lengthFileIdent) continue; - flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN); + flen = udf_get_filename(sb, iter->name, + iter->fi.lengthFileIdent, fname, UDF_NAME_LEN); if (flen < 0) { - fi = ERR_PTR(flen); + ret = flen; goto out_err; } if (udf_match(flen, fname, child->len, child->name)) goto out_ok; } + if (!ret) + ret = -ENOENT; - fi = NULL; out_err: - if (fibh->sbh != fibh->ebh) - brelse(fibh->ebh); - brelse(fibh->sbh); + udf_fiiter_release(iter); out_ok: - brelse(epos.bh); kfree(fname); - kfree(copy_name); - return fi; + return ret; } static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode = NULL; - struct fileIdentDesc cfi; - struct udf_fileident_bh fibh; - struct fileIdentDesc *fi; + struct udf_fileident_iter iter; + int err; if (dentry->d_name.len > UDF_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); - if (IS_ERR(fi)) - return ERR_CAST(fi); + err = udf_fiiter_find_entry(dir, &dentry->d_name, 
&iter); + if (err < 0 && err != -ENOENT) + return ERR_PTR(err); - if (fi) { + if (err == 0) { struct kernel_lb_addr loc; - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); + loc = lelb_to_cpu(iter.fi.icb.extLocation); + udf_fiiter_release(&iter); - loc = lelb_to_cpu(cfi.icb.extLocation); inode = udf_iget(dir->i_sb, &loc); if (IS_ERR(inode)) return ERR_CAST(inode); @@ -326,287 +136,249 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, return d_splice_alias(inode, dentry); } -static struct fileIdentDesc *udf_add_entry(struct inode *dir, - struct dentry *dentry, - struct udf_fileident_bh *fibh, - struct fileIdentDesc *cfi, int *err) +static int udf_expand_dir_adinicb(struct inode *inode, udf_pblk_t *block) { - struct super_block *sb = dir->i_sb; - struct fileIdentDesc *fi = NULL; - unsigned char *name = NULL; - int namelen; - loff_t f_pos; - loff_t size = udf_ext0_offset(dir) + dir->i_size; - int nfidlen; - udf_pblk_t block; + udf_pblk_t newblock; + struct buffer_head *dbh = NULL; struct kernel_lb_addr eloc; - uint32_t elen = 0; - sector_t offset; - struct extent_position epos = {}; - struct udf_inode_info *dinfo; + struct extent_position epos; + uint8_t alloctype; + struct udf_inode_info *iinfo = UDF_I(inode); + struct udf_fileident_iter iter; + uint8_t *impuse; + int ret; - fibh->sbh = fibh->ebh = NULL; - name = kmalloc(UDF_NAME_LEN_CS0, GFP_NOFS); - if (!name) { - *err = -ENOMEM; - goto out_err; - } + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) + alloctype = ICBTAG_FLAG_AD_SHORT; + else + alloctype = ICBTAG_FLAG_AD_LONG; - if (dentry) { - if (!dentry->d_name.len) { - *err = -EINVAL; - goto out_err; - } - namelen = udf_put_filename(sb, dentry->d_name.name, - dentry->d_name.len, - name, UDF_NAME_LEN_CS0); - if (!namelen) { - *err = -ENAMETOOLONG; - goto out_err; - } - } else { - namelen = 0; + if (!inode->i_size) { + iinfo->i_alloc_type = alloctype; + mark_inode_dirty(inode); + return 0; } - nfidlen = ALIGN(sizeof(struct fileIdentDesc) + namelen, UDF_NAME_PAD); - - f_pos = udf_ext0_offset(dir); - - fibh->soffset = fibh->eoffset = f_pos & (dir->i_sb->s_blocksize - 1); - dinfo = UDF_I(dir); - if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { - if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos, - &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) { - block = udf_get_lb_pblock(dir->i_sb, - &dinfo->i_location, 0); - fibh->soffset = fibh->eoffset = sb->s_blocksize; - goto add; - } - block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); - if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { - if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(struct short_ad); - else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(struct long_ad); - } else - offset = 0; - - fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block); - if (!fibh->sbh) { - *err = -EIO; - goto out_err; - } + /* alloc block, and copy data to it */ + *block = udf_new_block(inode->i_sb, inode, + iinfo->i_location.partitionReferenceNum, + iinfo->i_location.logicalBlockNum, &ret); + if (!(*block)) + return ret; + newblock = udf_get_pblock(inode->i_sb, *block, + iinfo->i_location.partitionReferenceNum, + 0); + if (newblock == 0xffffffff) + return -EFSCORRUPTED; + dbh = sb_getblk(inode->i_sb, newblock); + if (!dbh) + return -ENOMEM; + lock_buffer(dbh); + memcpy(dbh->b_data, iinfo->i_data, inode->i_size); + memset(dbh->b_data + inode->i_size, 0, + inode->i_sb->s_blocksize - inode->i_size); + set_buffer_uptodate(dbh); + 
unlock_buffer(dbh); + + /* Drop inline data, add block instead */ + iinfo->i_alloc_type = alloctype; + memset(iinfo->i_data + iinfo->i_lenEAttr, 0, iinfo->i_lenAlloc); + iinfo->i_lenAlloc = 0; + eloc.logicalBlockNum = *block; + eloc.partitionReferenceNum = + iinfo->i_location.partitionReferenceNum; + iinfo->i_lenExtents = inode->i_size; + epos.bh = NULL; + epos.block = iinfo->i_location; + epos.offset = udf_file_entry_alloc_offset(inode); + ret = udf_add_aext(inode, &epos, &eloc, inode->i_size, 0); + brelse(epos.bh); + if (ret < 0) { + brelse(dbh); + udf_free_blocks(inode->i_sb, inode, &eloc, 0, 1); + return ret; + } + mark_inode_dirty(inode); - block = dinfo->i_location.logicalBlockNum; + /* Now fixup tags in moved directory entries */ + for (ret = udf_fiiter_init(&iter, inode, 0); + !ret && iter.pos < inode->i_size; + ret = udf_fiiter_advance(&iter)) { + iter.fi.descTag.tagLocation = cpu_to_le32(*block); + if (iter.fi.lengthOfImpUse != cpu_to_le16(0)) + impuse = dbh->b_data + iter.pos + + sizeof(struct fileIdentDesc); + else + impuse = NULL; + udf_fiiter_write_fi(&iter, impuse); } + brelse(dbh); + /* + * We don't expect the iteration to fail as the directory has been + * already verified to be correct + */ + WARN_ON_ONCE(ret); + udf_fiiter_release(&iter); - while (f_pos < size) { - fi = udf_fileident_read(dir, &f_pos, fibh, cfi, &epos, &eloc, - &elen, &offset); + return 0; +} - if (!fi) { - *err = -EIO; - goto out_err; - } +static int udf_fiiter_add_entry(struct inode *dir, struct dentry *dentry, + struct udf_fileident_iter *iter) +{ + struct udf_inode_info *dinfo = UDF_I(dir); + int nfidlen, namelen = 0; + int ret; + int off, blksize = 1 << dir->i_blkbits; + udf_pblk_t block; + char name[UDF_NAME_LEN_CS0]; + + if (dentry) { + if (!dentry->d_name.len) + return -EINVAL; + namelen = udf_put_filename(dir->i_sb, dentry->d_name.name, + dentry->d_name.len, + name, UDF_NAME_LEN_CS0); + if (!namelen) + return -ENAMETOOLONG; + } + nfidlen = ALIGN(sizeof(struct fileIdentDesc) + namelen, UDF_NAME_PAD); - if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { - if (udf_dir_entry_len(cfi) == nfidlen) { - cfi->descTag.tagSerialNum = cpu_to_le16(1); - cfi->fileVersionNum = cpu_to_le16(1); - cfi->fileCharacteristics = 0; - cfi->lengthFileIdent = namelen; - cfi->lengthOfImpUse = cpu_to_le16(0); - if (!udf_write_fi(dir, cfi, fi, fibh, NULL, - name)) - goto out_ok; - else { - *err = -EIO; - goto out_err; - } + for (ret = udf_fiiter_init(iter, dir, 0); + !ret && iter->pos < dir->i_size; + ret = udf_fiiter_advance(iter)) { + if (iter->fi.fileCharacteristics & FID_FILE_CHAR_DELETED) { + if (udf_dir_entry_len(&iter->fi) == nfidlen) { + iter->fi.descTag.tagSerialNum = cpu_to_le16(1); + iter->fi.fileVersionNum = cpu_to_le16(1); + iter->fi.fileCharacteristics = 0; + iter->fi.lengthFileIdent = namelen; + iter->fi.lengthOfImpUse = cpu_to_le16(0); + memcpy(iter->namebuf, name, namelen); + iter->name = iter->namebuf; + return 0; } } } - -add: - f_pos += nfidlen; - + if (ret) { + udf_fiiter_release(iter); + return ret; + } if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB && - sb->s_blocksize - fibh->eoffset < nfidlen) { - brelse(epos.bh); - epos.bh = NULL; - fibh->soffset -= udf_ext0_offset(dir); - fibh->eoffset -= udf_ext0_offset(dir); - f_pos -= udf_ext0_offset(dir); - if (fibh->sbh != fibh->ebh) - brelse(fibh->ebh); - brelse(fibh->sbh); - fibh->sbh = fibh->ebh = - udf_expand_dir_adinicb(dir, &block, err); - if (!fibh->sbh) - goto out_err; - epos.block = dinfo->i_location; - epos.offset = 
udf_file_entry_alloc_offset(dir); - /* Load extent udf_expand_dir_adinicb() has created */ - udf_current_aext(dir, &epos, &eloc, &elen, 1); + blksize - udf_ext0_offset(dir) - iter->pos < nfidlen) { + udf_fiiter_release(iter); + ret = udf_expand_dir_adinicb(dir, &block); + if (ret) + return ret; + ret = udf_fiiter_init(iter, dir, dir->i_size); + if (ret < 0) + return ret; } - /* Entry fits into current block? */ - if (sb->s_blocksize - fibh->eoffset >= nfidlen) { - fibh->soffset = fibh->eoffset; - fibh->eoffset += nfidlen; - if (fibh->sbh != fibh->ebh) { - brelse(fibh->sbh); - fibh->sbh = fibh->ebh; - } - - if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { - block = dinfo->i_location.logicalBlockNum; - fi = (struct fileIdentDesc *) - (dinfo->i_data + fibh->soffset - - udf_ext0_offset(dir) + - dinfo->i_lenEAttr); - } else { - block = eloc.logicalBlockNum + - ((elen - 1) >> - dir->i_sb->s_blocksize_bits); - fi = (struct fileIdentDesc *) - (fibh->sbh->b_data + fibh->soffset); - } + /* Get blocknumber to use for entry tag */ + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + block = dinfo->i_location.logicalBlockNum; } else { - /* Round up last extent in the file */ - elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1); - if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(struct short_ad); - else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(struct long_ad); - udf_write_aext(dir, &epos, &eloc, elen, 1); - dinfo->i_lenExtents = (dinfo->i_lenExtents + sb->s_blocksize - - 1) & ~(sb->s_blocksize - 1); - - fibh->soffset = fibh->eoffset - sb->s_blocksize; - fibh->eoffset += nfidlen - sb->s_blocksize; - if (fibh->sbh != fibh->ebh) { - brelse(fibh->sbh); - fibh->sbh = fibh->ebh; - } + block = iter->eloc.logicalBlockNum + + ((iter->elen - 1) >> dir->i_blkbits); + } + off = iter->pos & (blksize - 1); + if (!off) + off = blksize; + /* Entry fits into current block? */ + if (blksize - udf_ext0_offset(dir) - off >= nfidlen) + goto store_fi; - block = eloc.logicalBlockNum + ((elen - 1) >> - dir->i_sb->s_blocksize_bits); - fibh->ebh = udf_bread(dir, - f_pos >> dir->i_sb->s_blocksize_bits, 1, err); - if (!fibh->ebh) - goto out_err; - /* Extents could have been merged, invalidate our position */ - brelse(epos.bh); - epos.bh = NULL; - epos.block = dinfo->i_location; - epos.offset = udf_file_entry_alloc_offset(dir); - - if (!fibh->soffset) { - /* Find the freshly allocated block */ - while (udf_next_aext(dir, &epos, &eloc, &elen, 1) == - (EXT_RECORDED_ALLOCATED >> 30)) - ; - block = eloc.logicalBlockNum + ((elen - 1) >> - dir->i_sb->s_blocksize_bits); - brelse(fibh->sbh); - fibh->sbh = fibh->ebh; - fi = (struct fileIdentDesc *)(fibh->sbh->b_data); - } else { - fi = (struct fileIdentDesc *) - (fibh->sbh->b_data + sb->s_blocksize + - fibh->soffset); - } + ret = udf_fiiter_append_blk(iter); + if (ret) { + udf_fiiter_release(iter); + return ret; } - memset(cfi, 0, sizeof(struct fileIdentDesc)); - if (UDF_SB(sb)->s_udfrev >= 0x0200) - udf_new_tag((char *)cfi, TAG_IDENT_FID, 3, 1, block, + /* Entry will be completely in the new block? Update tag location... 
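For the arithmetic behind this function: the slot-reuse test above (udf_dir_entry_len(&iter->fi) == nfidlen) and the fits-in-block checks all work on the padded on-disk size of a file identifier, which is the fixed descriptor header plus the impUse area plus the CS0 name, rounded up to UDF_NAME_PAD (4 byte) granularity. A worked example (a sketch, assuming the fixed part of struct fileIdentDesc is 38 bytes as laid out by ECMA-167):

	#include <stdio.h>

	#define UDF_NAME_PAD	4
	#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

	int main(void)
	{
		unsigned fid_fixed = 38;	/* descTag through lengthOfImpUse */
		unsigned impuse = 0;		/* no implementation use area */
		unsigned namelen = 5;		/* e.g. a 5-byte CS0 name */

		/* same shape as nfidlen in udf_fiiter_add_entry() */
		printf("%u\n", ALIGN(fid_fixed + impuse + namelen, UDF_NAME_PAD));
		return 0;			/* prints 44 */
	}

A deleted slot is reused only when the padded sizes match exactly, so no neighbouring entries need to move.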
*/ + if (!(iter->pos & (blksize - 1))) + block = iter->eloc.logicalBlockNum + + ((iter->elen - 1) >> dir->i_blkbits); +store_fi: + memset(&iter->fi, 0, sizeof(struct fileIdentDesc)); + if (UDF_SB(dir->i_sb)->s_udfrev >= 0x0200) + udf_new_tag((char *)(&iter->fi), TAG_IDENT_FID, 3, 1, block, sizeof(struct tag)); else - udf_new_tag((char *)cfi, TAG_IDENT_FID, 2, 1, block, + udf_new_tag((char *)(&iter->fi), TAG_IDENT_FID, 2, 1, block, sizeof(struct tag)); - cfi->fileVersionNum = cpu_to_le16(1); - cfi->lengthFileIdent = namelen; - cfi->lengthOfImpUse = cpu_to_le16(0); - if (!udf_write_fi(dir, cfi, fi, fibh, NULL, name)) { - dir->i_size += nfidlen; - if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - dinfo->i_lenAlloc += nfidlen; - else { - /* Find the last extent and truncate it to proper size */ - while (udf_next_aext(dir, &epos, &eloc, &elen, 1) == - (EXT_RECORDED_ALLOCATED >> 30)) - ; - elen -= dinfo->i_lenExtents - dir->i_size; - if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(struct short_ad); - else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(struct long_ad); - udf_write_aext(dir, &epos, &eloc, elen, 1); - dinfo->i_lenExtents = dir->i_size; - } - - mark_inode_dirty(dir); - goto out_ok; + iter->fi.fileVersionNum = cpu_to_le16(1); + iter->fi.lengthFileIdent = namelen; + iter->fi.lengthOfImpUse = cpu_to_le16(0); + memcpy(iter->namebuf, name, namelen); + iter->name = iter->namebuf; + + dir->i_size += nfidlen; + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + dinfo->i_lenAlloc += nfidlen; } else { - *err = -EIO; - goto out_err; + /* Truncate last extent to proper size */ + udf_fiiter_update_elen(iter, iter->elen - + (dinfo->i_lenExtents - dir->i_size)); } + mark_inode_dirty(dir); -out_err: - fi = NULL; - if (fibh->sbh != fibh->ebh) - brelse(fibh->ebh); - brelse(fibh->sbh); -out_ok: - brelse(epos.bh); - kfree(name); - return fi; + return 0; } -static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi, - struct udf_fileident_bh *fibh, - struct fileIdentDesc *cfi) +static void udf_fiiter_delete_entry(struct udf_fileident_iter *iter) { - cfi->fileCharacteristics |= FID_FILE_CHAR_DELETED; + iter->fi.fileCharacteristics |= FID_FILE_CHAR_DELETED; + + if (UDF_QUERY_FLAG(iter->dir->i_sb, UDF_FLAG_STRICT)) + memset(&iter->fi.icb, 0x00, sizeof(struct long_ad)); + + udf_fiiter_write_fi(iter, NULL); +} - if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT)) - memset(&(cfi->icb), 0x00, sizeof(struct long_ad)); +static void udf_add_fid_counter(struct super_block *sb, bool dir, int val) +{ + struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb); - return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL); + if (!lvidiu) + return; + mutex_lock(&UDF_SB(sb)->s_alloc_mutex); + if (dir) + le32_add_cpu(&lvidiu->numDirs, val); + else + le32_add_cpu(&lvidiu->numFiles, val); + udf_updated_lvid(sb); + mutex_unlock(&UDF_SB(sb)->s_alloc_mutex); } static int udf_add_nondir(struct dentry *dentry, struct inode *inode) { struct udf_inode_info *iinfo = UDF_I(inode); struct inode *dir = d_inode(dentry->d_parent); - struct udf_fileident_bh fibh; - struct fileIdentDesc cfi, *fi; + struct udf_fileident_iter iter; int err; - fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); - if (unlikely(!fi)) { + err = udf_fiiter_add_entry(dir, dentry, &iter); + if (err) { inode_dec_link_count(inode); discard_new_inode(inode); return err; } - cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); - cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); - *(__le32 
*)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = + iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); + iter.fi.icb.extLocation = cpu_to_lelb(iinfo->i_location); + *(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse = cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL); - udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); + udf_fiiter_write_fi(&iter, NULL); dir->i_ctime = dir->i_mtime = current_time(dir); mark_inode_dirty(dir); - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); + udf_fiiter_release(&iter); + udf_add_fid_counter(dir->i_sb, false, 1); d_instantiate_new(dentry, inode); return 0; } -static int udf_create(struct user_namespace *mnt_userns, struct inode *dir, +static int udf_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode = udf_new_inode(dir, mode); @@ -614,10 +386,7 @@ static int udf_create(struct user_namespace *mnt_userns, struct inode *dir, if (IS_ERR(inode)) return PTR_ERR(inode); - if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - inode->i_data.a_ops = &udf_adinicb_aops; - else - inode->i_data.a_ops = &udf_aops; + inode->i_data.a_ops = &udf_aops; inode->i_op = &udf_file_inode_operations; inode->i_fop = &udf_file_operations; mark_inode_dirty(inode); @@ -625,7 +394,7 @@ static int udf_create(struct user_namespace *mnt_userns, struct inode *dir, return udf_add_nondir(dentry, inode); } -static int udf_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +static int udf_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode = udf_new_inode(dir, mode); @@ -633,10 +402,7 @@ static int udf_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (IS_ERR(inode)) return PTR_ERR(inode); - if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - inode->i_data.a_ops = &udf_adinicb_aops; - else - inode->i_data.a_ops = &udf_aops; + inode->i_data.a_ops = &udf_aops; inode->i_op = &udf_file_inode_operations; inode->i_fop = &udf_file_operations; mark_inode_dirty(inode); @@ -645,7 +411,7 @@ static int udf_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, return finish_open_simple(file, 0); } -static int udf_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int udf_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; @@ -661,12 +427,11 @@ static int udf_mknod(struct user_namespace *mnt_userns, struct inode *dir, return udf_add_nondir(dentry, inode); } -static int udf_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; - struct udf_fileident_bh fibh; - struct fileIdentDesc cfi, *fi; + struct udf_fileident_iter iter; int err; struct udf_inode_info *dinfo = UDF_I(dir); struct udf_inode_info *iinfo; @@ -678,183 +443,113 @@ static int udf_mkdir(struct user_namespace *mnt_userns, struct inode *dir, iinfo = UDF_I(inode); inode->i_op = &udf_dir_inode_operations; inode->i_fop = &udf_dir_operations; - fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err); - if (!fi) { - inode_dec_link_count(inode); + err = udf_fiiter_add_entry(inode, NULL, &iter); + if (err) { + clear_nlink(inode); discard_new_inode(inode); - goto out; + return err; } set_nlink(inode, 2); - cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); - cfi.icb.extLocation = cpu_to_lelb(dinfo->i_location); - *(__le32 
*)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = + iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); + iter.fi.icb.extLocation = cpu_to_lelb(dinfo->i_location); + *(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse = cpu_to_le32(dinfo->i_unique & 0x00000000FFFFFFFFUL); - cfi.fileCharacteristics = + iter.fi.fileCharacteristics = FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT; - udf_write_fi(inode, &cfi, fi, &fibh, NULL, NULL); - brelse(fibh.sbh); + udf_fiiter_write_fi(&iter, NULL); + udf_fiiter_release(&iter); mark_inode_dirty(inode); - fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); - if (!fi) { + err = udf_fiiter_add_entry(dir, dentry, &iter); + if (err) { clear_nlink(inode); - mark_inode_dirty(inode); discard_new_inode(inode); - goto out; + return err; } - cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); - cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); - *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = + iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); + iter.fi.icb.extLocation = cpu_to_lelb(iinfo->i_location); + *(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse = cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL); - cfi.fileCharacteristics |= FID_FILE_CHAR_DIRECTORY; - udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); + iter.fi.fileCharacteristics |= FID_FILE_CHAR_DIRECTORY; + udf_fiiter_write_fi(&iter, NULL); + udf_fiiter_release(&iter); + udf_add_fid_counter(dir->i_sb, true, 1); inc_nlink(dir); dir->i_ctime = dir->i_mtime = current_time(dir); mark_inode_dirty(dir); d_instantiate_new(dentry, inode); - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - err = 0; -out: - return err; + return 0; } static int empty_dir(struct inode *dir) { - struct fileIdentDesc *fi, cfi; - struct udf_fileident_bh fibh; - loff_t f_pos; - loff_t size = udf_ext0_offset(dir) + dir->i_size; - udf_pblk_t block; - struct kernel_lb_addr eloc; - uint32_t elen; - sector_t offset; - struct extent_position epos = {}; - struct udf_inode_info *dinfo = UDF_I(dir); - - f_pos = udf_ext0_offset(dir); - fibh.soffset = fibh.eoffset = f_pos & (dir->i_sb->s_blocksize - 1); - - if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - fibh.sbh = fibh.ebh = NULL; - else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, - &epos, &eloc, &elen, &offset) == - (EXT_RECORDED_ALLOCATED >> 30)) { - block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); - if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { - if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(struct short_ad); - else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(struct long_ad); - } else - offset = 0; - - fibh.sbh = fibh.ebh = udf_tread(dir->i_sb, block); - if (!fibh.sbh) { - brelse(epos.bh); + struct udf_fileident_iter iter; + int ret; + + for (ret = udf_fiiter_init(&iter, dir, 0); + !ret && iter.pos < dir->i_size; + ret = udf_fiiter_advance(&iter)) { + if (iter.fi.lengthFileIdent && + !(iter.fi.fileCharacteristics & FID_FILE_CHAR_DELETED)) { + udf_fiiter_release(&iter); return 0; } - } else { - brelse(epos.bh); - return 0; } - - while (f_pos < size) { - fi = udf_fileident_read(dir, &f_pos, &fibh, &cfi, &epos, &eloc, - &elen, &offset); - if (!fi) { - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - brelse(epos.bh); - return 0; - } - - if (cfi.lengthFileIdent && - (cfi.fileCharacteristics & FID_FILE_CHAR_DELETED) == 0) { - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - 
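The loop shape above is the whole point of the iterator conversion: the old code juggled two buffer heads (sbh/ebh) plus an extent position and had to repeat the brelse() cleanup chains being deleted here on every exit path, while every converted caller in this patch (udf_fiiter_find_entry(), udf_fiiter_add_entry(), empty_dir()) now uses one canonical scan. A minimal sketch of such a scan with the new API, kernel context assumed:

	struct udf_fileident_iter iter;
	int ret;

	for (ret = udf_fiiter_init(&iter, dir, 0);
	     !ret && iter.pos < dir->i_size;
	     ret = udf_fiiter_advance(&iter)) {
		/* iter.fi is a copy of the FID, valid even when the on-disk
		 * entry straddles two blocks; iter.name points at its name */
		if (iter.fi.fileCharacteristics & FID_FILE_CHAR_DELETED)
			continue;
		/* ...inspect iter.fi.lengthFileIdent / iter.name here... */
	}
	udf_fiiter_release(&iter);	/* needed on every exit path */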
brelse(epos.bh); - return 0; - } - } - - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - brelse(epos.bh); + udf_fiiter_release(&iter); return 1; } static int udf_rmdir(struct inode *dir, struct dentry *dentry) { - int retval; + int ret; struct inode *inode = d_inode(dentry); - struct udf_fileident_bh fibh; - struct fileIdentDesc *fi, cfi; + struct udf_fileident_iter iter; struct kernel_lb_addr tloc; - retval = -ENOENT; - fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); - if (IS_ERR_OR_NULL(fi)) { - if (fi) - retval = PTR_ERR(fi); + ret = udf_fiiter_find_entry(dir, &dentry->d_name, &iter); + if (ret) goto out; - } - retval = -EIO; - tloc = lelb_to_cpu(cfi.icb.extLocation); + ret = -EFSCORRUPTED; + tloc = lelb_to_cpu(iter.fi.icb.extLocation); if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino) goto end_rmdir; - retval = -ENOTEMPTY; + ret = -ENOTEMPTY; if (!empty_dir(inode)) goto end_rmdir; - retval = udf_delete_entry(dir, fi, &fibh, &cfi); - if (retval) - goto end_rmdir; + udf_fiiter_delete_entry(&iter); if (inode->i_nlink != 2) udf_warn(inode->i_sb, "empty directory has nlink != 2 (%u)\n", inode->i_nlink); clear_nlink(inode); inode->i_size = 0; inode_dec_link_count(dir); + udf_add_fid_counter(dir->i_sb, true, -1); inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); mark_inode_dirty(dir); - + ret = 0; end_rmdir: - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - + udf_fiiter_release(&iter); out: - return retval; + return ret; } static int udf_unlink(struct inode *dir, struct dentry *dentry) { - int retval; + int ret; struct inode *inode = d_inode(dentry); - struct udf_fileident_bh fibh; - struct fileIdentDesc *fi; - struct fileIdentDesc cfi; + struct udf_fileident_iter iter; struct kernel_lb_addr tloc; - retval = -ENOENT; - fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); - - if (IS_ERR_OR_NULL(fi)) { - if (fi) - retval = PTR_ERR(fi); + ret = udf_fiiter_find_entry(dir, &dentry->d_name, &iter); + if (ret) goto out; - } - retval = -EIO; - tloc = lelb_to_cpu(cfi.icb.extLocation); + ret = -EFSCORRUPTED; + tloc = lelb_to_cpu(iter.fi.icb.extLocation); if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino) goto end_unlink; @@ -863,25 +558,20 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) inode->i_ino, inode->i_nlink); set_nlink(inode, 1); } - retval = udf_delete_entry(dir, fi, &fibh, &cfi); - if (retval) - goto end_unlink; + udf_fiiter_delete_entry(&iter); dir->i_ctime = dir->i_mtime = current_time(dir); mark_inode_dirty(dir); inode_dec_link_count(inode); + udf_add_fid_counter(dir->i_sb, false, -1); inode->i_ctime = dir->i_ctime; - retval = 0; - + ret = 0; end_unlink: - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - + udf_fiiter_release(&iter); out: - return retval; + return ret; } -static int udf_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int udf_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct inode *inode = udf_new_inode(dir, S_IFLNK | 0777); @@ -929,15 +619,20 @@ static int udf_symlink(struct user_namespace *mnt_userns, struct inode *dir, iinfo->i_location.partitionReferenceNum; bsize = sb->s_blocksize; iinfo->i_lenExtents = bsize; - udf_add_aext(inode, &epos, &eloc, bsize, 0); + err = udf_add_aext(inode, &epos, &eloc, bsize, 0); brelse(epos.bh); + if (err < 0) { + udf_free_blocks(sb, inode, &eloc, 0, 1); + goto out_no_entry; + } block = udf_get_pblock(sb, block, 
iinfo->i_location.partitionReferenceNum, 0); - epos.bh = udf_tgetblk(sb, block); + epos.bh = sb_getblk(sb, block); if (unlikely(!epos.bh)) { err = -ENOMEM; + udf_free_blocks(sb, inode, &eloc, 0, 1); goto out_no_entry; } lock_buffer(epos.bh); @@ -1038,28 +733,23 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); - struct udf_fileident_bh fibh; - struct fileIdentDesc cfi, *fi; + struct udf_fileident_iter iter; int err; - fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); - if (!fi) { + err = udf_fiiter_add_entry(dir, dentry, &iter); + if (err) return err; - } - cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); - cfi.icb.extLocation = cpu_to_lelb(UDF_I(inode)->i_location); + iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); + iter.fi.icb.extLocation = cpu_to_lelb(UDF_I(inode)->i_location); if (UDF_SB(inode->i_sb)->s_lvid_bh) { - *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = + *(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse = cpu_to_le32(lvid_get_unique_id(inode->i_sb)); } - udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); - if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - mark_inode_dirty(dir); + udf_fiiter_write_fi(&iter, NULL); + udf_fiiter_release(&iter); - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); inc_nlink(inode); + udf_add_fid_counter(dir->i_sb, false, 1); inode->i_ctime = current_time(inode); mark_inode_dirty(inode); dir->i_ctime = dir->i_mtime = current_time(dir); @@ -1073,84 +763,81 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, /* Anybody can rename anything with this: the permission checks are left to the * higher-level routines. */ -static int udf_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); - struct udf_fileident_bh ofibh, nfibh; - struct fileIdentDesc *ofi = NULL, *nfi = NULL, *dir_fi = NULL; - struct fileIdentDesc ocfi, ncfi; - struct buffer_head *dir_bh = NULL; - int retval = -ENOENT; + struct udf_fileident_iter oiter, niter, diriter; + bool has_diriter = false; + int retval; struct kernel_lb_addr tloc; - struct udf_inode_info *old_iinfo = UDF_I(old_inode); if (flags & ~RENAME_NOREPLACE) return -EINVAL; - ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); - if (!ofi || IS_ERR(ofi)) { - if (IS_ERR(ofi)) - retval = PTR_ERR(ofi); - goto end_rename; - } - - if (ofibh.sbh != ofibh.ebh) - brelse(ofibh.ebh); - - brelse(ofibh.sbh); - tloc = lelb_to_cpu(ocfi.icb.extLocation); - if (udf_get_lb_pblock(old_dir->i_sb, &tloc, 0) != old_inode->i_ino) - goto end_rename; + retval = udf_fiiter_find_entry(old_dir, &old_dentry->d_name, &oiter); + if (retval) + return retval; - nfi = udf_find_entry(new_dir, &new_dentry->d_name, &nfibh, &ncfi); - if (IS_ERR(nfi)) { - retval = PTR_ERR(nfi); - goto end_rename; - } - if (nfi && !new_inode) { - if (nfibh.sbh != nfibh.ebh) - brelse(nfibh.ebh); - brelse(nfibh.sbh); - nfi = NULL; + tloc = lelb_to_cpu(oiter.fi.icb.extLocation); + if (udf_get_lb_pblock(old_dir->i_sb, &tloc, 0) != old_inode->i_ino) { + retval = -ENOENT; + goto out_oiter; } - if (S_ISDIR(old_inode->i_mode)) { - int offset = udf_ext0_offset(old_inode); + if (S_ISDIR(old_inode->i_mode)) { if (new_inode) { 
retval = -ENOTEMPTY; if (!empty_dir(new_inode)) - goto end_rename; + goto out_oiter; + } + /* + * We need to protect against old_inode getting converted from + * ICB to normal directory. + */ + inode_lock_nested(old_inode, I_MUTEX_NONDIR2); + retval = udf_fiiter_find_entry(old_inode, &dotdot_name, + &diriter); + if (retval == -ENOENT) { + udf_err(old_inode->i_sb, + "directory (ino %lu) has no '..' entry\n", + old_inode->i_ino); + retval = -EFSCORRUPTED; } - retval = -EIO; - if (old_iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { - dir_fi = udf_get_fileident( - old_iinfo->i_data - - (old_iinfo->i_efe ? - sizeof(struct extendedFileEntry) : - sizeof(struct fileEntry)), - old_inode->i_sb->s_blocksize, &offset); - } else { - dir_bh = udf_bread(old_inode, 0, 0, &retval); - if (!dir_bh) - goto end_rename; - dir_fi = udf_get_fileident(dir_bh->b_data, - old_inode->i_sb->s_blocksize, &offset); + if (retval) { + inode_unlock(old_inode); + goto out_oiter; } - if (!dir_fi) - goto end_rename; - tloc = lelb_to_cpu(dir_fi->icb.extLocation); + has_diriter = true; + tloc = lelb_to_cpu(diriter.fi.icb.extLocation); if (udf_get_lb_pblock(old_inode->i_sb, &tloc, 0) != - old_dir->i_ino) - goto end_rename; + old_dir->i_ino) { + retval = -EFSCORRUPTED; + udf_err(old_inode->i_sb, + "directory (ino %lu) has parent entry pointing to another inode (%lu != %u)\n", + old_inode->i_ino, old_dir->i_ino, + udf_get_lb_pblock(old_inode->i_sb, &tloc, 0)); + goto out_oiter; + } + } + + retval = udf_fiiter_find_entry(new_dir, &new_dentry->d_name, &niter); + if (retval && retval != -ENOENT) + goto out_oiter; + /* Entry found but not passed by VFS? */ + if (!retval && !new_inode) { + retval = -EFSCORRUPTED; + udf_fiiter_release(&niter); + goto out_oiter; } - if (!nfi) { - nfi = udf_add_entry(new_dir, new_dentry, &nfibh, &ncfi, - &retval); - if (!nfi) - goto end_rename; + /* Entry not found? Need to add one... */ + if (retval) { + udf_fiiter_release(&niter); + retval = udf_fiiter_add_entry(new_dir, new_dentry, &niter); + if (retval) + goto out_oiter; } /* @@ -1163,31 +850,46 @@ static int udf_rename(struct user_namespace *mnt_userns, struct inode *old_dir, /* * ok, that's it */ - ncfi.fileVersionNum = ocfi.fileVersionNum; - ncfi.fileCharacteristics = ocfi.fileCharacteristics; - memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(ocfi.icb)); - udf_write_fi(new_dir, &ncfi, nfi, &nfibh, NULL, NULL); + niter.fi.fileVersionNum = oiter.fi.fileVersionNum; + niter.fi.fileCharacteristics = oiter.fi.fileCharacteristics; + memcpy(&(niter.fi.icb), &(oiter.fi.icb), sizeof(oiter.fi.icb)); + udf_fiiter_write_fi(&niter, NULL); + udf_fiiter_release(&niter); - /* The old fid may have moved - find it again */ - ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); - udf_delete_entry(old_dir, ofi, &ofibh, &ocfi); + /* + * The old entry may have moved due to new entry allocation. Find it + * again. 
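This re-lookup is load-bearing: udf_fiiter_add_entry() may convert the directory from in-ICB to block-based storage or append and merge extents, so any iterator taken before it, including oiter for the entry being renamed, can point at a stale location afterwards. Simplified ordering of the rename path (a sketch with error handling elided, names as in the hunk):

	/* 1) locate the entry being renamed */
	udf_fiiter_find_entry(old_dir, &old_dentry->d_name, &oiter);
	/* 2) create or overwrite the target entry; may reshape the dir */
	udf_fiiter_add_entry(new_dir, new_dentry, &niter);
	udf_fiiter_write_fi(&niter, NULL);
	udf_fiiter_release(&niter);
	/* 3) oiter may be stale now: drop it and look the name up again */
	udf_fiiter_release(&oiter);
	udf_fiiter_find_entry(old_dir, &old_dentry->d_name, &oiter);
	udf_fiiter_delete_entry(&oiter);
	udf_fiiter_release(&oiter);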
+ */ + udf_fiiter_release(&oiter); + retval = udf_fiiter_find_entry(old_dir, &old_dentry->d_name, &oiter); + if (retval) { + udf_err(old_dir->i_sb, + "failed to find renamed entry again in directory (ino %lu)\n", + old_dir->i_ino); + } else { + udf_fiiter_delete_entry(&oiter); + udf_fiiter_release(&oiter); + } if (new_inode) { new_inode->i_ctime = current_time(new_inode); inode_dec_link_count(new_inode); + udf_add_fid_counter(old_dir->i_sb, S_ISDIR(new_inode->i_mode), + -1); } old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); new_dir->i_ctime = new_dir->i_mtime = current_time(new_dir); mark_inode_dirty(old_dir); mark_inode_dirty(new_dir); - if (dir_fi) { - dir_fi->icb.extLocation = cpu_to_lelb(UDF_I(new_dir)->i_location); - udf_update_tag((char *)dir_fi, udf_dir_entry_len(dir_fi)); - if (old_iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - mark_inode_dirty(old_inode); - else - mark_buffer_dirty_inode(dir_bh, old_inode); + if (has_diriter) { + diriter.fi.icb.extLocation = + cpu_to_lelb(UDF_I(new_dir)->i_location); + udf_update_tag((char *)&diriter.fi, + udf_dir_entry_len(&diriter.fi)); + udf_fiiter_write_fi(&diriter, NULL); + udf_fiiter_release(&diriter); + inode_unlock(old_inode); inode_dec_link_count(old_dir); if (new_inode) @@ -1197,22 +899,13 @@ static int udf_rename(struct user_namespace *mnt_userns, struct inode *old_dir, mark_inode_dirty(new_dir); } } - - if (ofi) { - if (ofibh.sbh != ofibh.ebh) - brelse(ofibh.ebh); - brelse(ofibh.sbh); - } - - retval = 0; - -end_rename: - brelse(dir_bh); - if (nfi) { - if (nfibh.sbh != nfibh.ebh) - brelse(nfibh.ebh); - brelse(nfibh.sbh); + return 0; +out_oiter: + if (has_diriter) { + udf_fiiter_release(&diriter); + inode_unlock(old_inode); } + udf_fiiter_release(&oiter); return retval; } @@ -1221,17 +914,15 @@ static struct dentry *udf_get_parent(struct dentry *child) { struct kernel_lb_addr tloc; struct inode *inode = NULL; - struct fileIdentDesc cfi; - struct udf_fileident_bh fibh; - - if (!udf_find_entry(d_inode(child), &dotdot_name, &fibh, &cfi)) - return ERR_PTR(-EACCES); + struct udf_fileident_iter iter; + int err; - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); + err = udf_fiiter_find_entry(d_inode(child), &dotdot_name, &iter); + if (err) + return ERR_PTR(err); - tloc = lelb_to_cpu(cfi.icb.extLocation); + tloc = lelb_to_cpu(iter.fi.icb.extLocation); + udf_fiiter_release(&iter); inode = udf_iget(child->d_sb, &tloc); if (IS_ERR(inode)) return ERR_CAST(inode); diff --git a/fs/udf/partition.c b/fs/udf/partition.c index 4cbf40575965..5bcfe78d5cab 100644 --- a/fs/udf/partition.c +++ b/fs/udf/partition.c @@ -54,6 +54,7 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block, struct udf_part_map *map; struct udf_virtual_data *vdata; struct udf_inode_info *iinfo = UDF_I(sbi->s_vat_inode); + int err; map = &sbi->s_partmaps[partition]; vdata = &map->s_type_specific.s_virtual; @@ -79,12 +80,10 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block, index = vdata->s_start_offset / sizeof(uint32_t) + block; } - loc = udf_block_map(sbi->s_vat_inode, newblock); - - bh = sb_bread(sb, loc); + bh = udf_bread(sbi->s_vat_inode, newblock, 0, &err); if (!bh) { - udf_debug("get_pblock(UDF_VIRTUAL_MAP:%p,%u,%u) VAT: %u[%u]\n", - sb, block, partition, loc, index); + udf_debug("get_pblock(UDF_VIRTUAL_MAP:%p,%u,%u)\n", + sb, block, partition); return 0xFFFFFFFF; } diff --git a/fs/udf/super.c b/fs/udf/super.c index 06eda8177b5f..6304e3c5c3d9 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -86,6 +86,13 
@@ enum { #define UDF_MAX_LVID_NESTING 1000 enum { UDF_MAX_LINKS = 0xffff }; +/* + * We limit filesize to 4TB. This is arbitrary as the on-disk format supports + * more but because the file space is described by a linked list of extents, + * each of which can have at most 1GB, the creation and handling of extents + * gets unusably slow beyond certain point... + */ +#define UDF_MAX_FILESIZE (1ULL << 42) /* These are the "meat" - everything else is stuffing */ static int udf_fill_super(struct super_block *, void *, int); @@ -147,6 +154,7 @@ static struct inode *udf_alloc_inode(struct super_block *sb) ei->i_next_alloc_goal = 0; ei->i_strat4096 = 0; ei->i_streamdir = 0; + ei->i_hidden = 0; init_rwsem(&ei->i_data_sem); ei->cached_extent.lstart = -1; spin_lock_init(&ei->i_extent_cache_lock); @@ -733,7 +741,7 @@ static int udf_check_vsd(struct super_block *sb) * added */ for (; !nsr && sector < VSD_MAX_SECTOR_OFFSET; sector += sectorsize) { /* Read a block */ - bh = udf_tread(sb, sector >> sb->s_blocksize_bits); + bh = sb_bread(sb, sector >> sb->s_blocksize_bits); if (!bh) break; @@ -1175,7 +1183,6 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) struct udf_part_map *map = &sbi->s_partmaps[p_index]; struct buffer_head *bh = NULL; struct udf_inode_info *vati; - uint32_t pos; struct virtualAllocationTable20 *vat20; sector_t blocks = sb_bdev_nr_blocks(sb); @@ -1197,10 +1204,14 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) } else if (map->s_partition_type == UDF_VIRTUAL_MAP20) { vati = UDF_I(sbi->s_vat_inode); if (vati->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { - pos = udf_block_map(sbi->s_vat_inode, 0); - bh = sb_bread(sb, pos); - if (!bh) - return -EIO; + int err = 0; + + bh = udf_bread(sbi->s_vat_inode, 0, 0, &err); + if (!bh) { + if (!err) + err = -EFSCORRUPTED; + return err; + } vat20 = (struct virtualAllocationTable20 *)bh->b_data; } else { vat20 = (struct virtualAllocationTable20 *) @@ -1838,10 +1849,6 @@ static int udf_check_anchor_block(struct super_block *sb, sector_t block, uint16_t ident; int ret; - if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) && - udf_fixed_to_variable(block) >= sb_bdev_nr_blocks(sb)) - return -EAGAIN; - bh = udf_read_tagged(sb, block, block, &ident); if (!bh) return -EAGAIN; @@ -1860,10 +1867,10 @@ static int udf_check_anchor_block(struct super_block *sb, sector_t block, * Returns < 0 on error, 0 on success. -EAGAIN is special - try next set * of anchors. */ -static int udf_scan_anchors(struct super_block *sb, sector_t *lastblock, +static int udf_scan_anchors(struct super_block *sb, udf_pblk_t *lastblock, struct kernel_lb_addr *fileset) { - sector_t last[6]; + udf_pblk_t last[6]; int i; struct udf_sb_info *sbi = UDF_SB(sb); int last_count = 0; @@ -1924,46 +1931,6 @@ static int udf_scan_anchors(struct super_block *sb, sector_t *lastblock, } /* - * Find an anchor volume descriptor and load Volume Descriptor Sequence from - * area specified by it. The function expects sbi->s_lastblock to be the last - * block on the media. - * - * Return <0 on error, 0 if anchor found. -EAGAIN is special meaning anchor - * was not found. - */ -static int udf_find_anchor(struct super_block *sb, - struct kernel_lb_addr *fileset) -{ - struct udf_sb_info *sbi = UDF_SB(sb); - sector_t lastblock = sbi->s_last_block; - int ret; - - ret = udf_scan_anchors(sb, &lastblock, fileset); - if (ret != -EAGAIN) - goto out; - - /* No anchor found? 
Try VARCONV conversion of block numbers */ - UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); - lastblock = udf_variable_to_fixed(sbi->s_last_block); - /* Firstly, we try to not convert number of the last block */ - ret = udf_scan_anchors(sb, &lastblock, fileset); - if (ret != -EAGAIN) - goto out; - - lastblock = sbi->s_last_block; - /* Secondly, we try with converted number of the last block */ - ret = udf_scan_anchors(sb, &lastblock, fileset); - if (ret < 0) { - /* VARCONV didn't help. Clear it. */ - UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV); - } -out: - if (ret == 0) - sbi->s_last_block = lastblock; - return ret; -} - -/* * Check Volume Structure Descriptor, find Anchor block and load Volume * Descriptor Sequence. * @@ -2003,7 +1970,7 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, /* Look for anchor block and load Volume Descriptor Sequence */ sbi->s_anchor = uopt->anchor; - ret = udf_find_anchor(sb, fileset); + ret = udf_scan_anchors(sb, &sbi->s_last_block, fileset); if (ret < 0) { if (!silent && ret == -EAGAIN) udf_warn(sb, "No anchor found\n"); @@ -2297,7 +2264,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) ret = -ENOMEM; goto error_out; } - sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_maxbytes = UDF_MAX_FILESIZE; sb->s_max_links = UDF_MAX_LINKS; return 0; @@ -2454,7 +2421,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb, if (bytes) { brelse(bh); newblock = udf_get_lb_pblock(sb, &loc, ++block); - bh = udf_tread(sb, newblock); + bh = sb_bread(sb, newblock); if (!bh) { udf_debug("read failed\n"); goto out; diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index f3642f9c23f8..a34c8c4e6d21 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -107,53 +107,45 @@ static int udf_symlink_filler(struct file *file, struct folio *folio) struct inode *inode = page->mapping->host; struct buffer_head *bh = NULL; unsigned char *symlink; - int err; + int err = 0; unsigned char *p = page_address(page); - struct udf_inode_info *iinfo; - uint32_t pos; + struct udf_inode_info *iinfo = UDF_I(inode); /* We don't support symlinks longer than one block */ if (inode->i_size > inode->i_sb->s_blocksize) { err = -ENAMETOOLONG; - goto out_unmap; + goto out_unlock; } - iinfo = UDF_I(inode); - pos = udf_block_map(inode, 0); - - down_read(&iinfo->i_data_sem); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { symlink = iinfo->i_data + iinfo->i_lenEAttr; } else { - bh = sb_bread(inode->i_sb, pos); - + bh = udf_bread(inode, 0, 0, &err); if (!bh) { - err = -EIO; - goto out_unlock_inode; + if (!err) + err = -EFSCORRUPTED; + goto out_err; } - symlink = bh->b_data; } err = udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p, PAGE_SIZE); brelse(bh); if (err) - goto out_unlock_inode; + goto out_err; - up_read(&iinfo->i_data_sem); SetPageUptodate(page); unlock_page(page); return 0; -out_unlock_inode: - up_read(&iinfo->i_data_sem); +out_err: SetPageError(page); -out_unmap: +out_unlock: unlock_page(page); return err; } -static int udf_symlink_getattr(struct user_namespace *mnt_userns, +static int udf_symlink_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { @@ -161,7 +153,7 @@ static int udf_symlink_getattr(struct user_namespace *mnt_userns, struct inode *inode = d_backing_inode(dentry); struct page *page; - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(&nop_mnt_idmap, inode, stat); page = read_mapping_page(inode->i_mapping, 0, NULL); if (IS_ERR(page)) return 
PTR_ERR(page); diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 036ebd892b85..871856c69df5 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -125,7 +125,7 @@ void udf_discard_prealloc(struct inode *inode) struct kernel_lb_addr eloc; uint32_t elen; uint64_t lbcount = 0; - int8_t etype = -1, netype; + int8_t etype = -1; struct udf_inode_info *iinfo = UDF_I(inode); int bsize = 1 << inode->i_blkbits; @@ -136,7 +136,7 @@ void udf_discard_prealloc(struct inode *inode) epos.block = iinfo->i_location; /* Find the last extent in the file */ - while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 0)) != -1) { + while (udf_next_aext(inode, &epos, &eloc, &elen, 0) != -1) { brelse(prev_epos.bh); prev_epos = epos; if (prev_epos.bh) @@ -240,7 +240,7 @@ int udf_truncate_extents(struct inode *inode) brelse(epos.bh); epos.offset = sizeof(struct allocExtDesc); epos.block = eloc; - epos.bh = udf_tread(sb, + epos.bh = sb_bread(sb, udf_get_lb_pblock(sb, &eloc, 0)); /* Error reading indirect block? */ if (!epos.bh) diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h index 06ff7006b822..312b7c9ef10e 100644 --- a/fs/udf/udf_i.h +++ b/fs/udf/udf_i.h @@ -44,7 +44,8 @@ struct udf_inode_info { unsigned i_use : 1; /* unallocSpaceEntry */ unsigned i_strat4096 : 1; unsigned i_streamdir : 1; - unsigned reserved : 25; + unsigned i_hidden : 1; /* hidden system inode */ + unsigned reserved : 24; __u8 *i_data; struct kernel_lb_addr i_locStreamdir; __u64 i_lenStreams; diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 291b56dd011e..9af6ff7f9747 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -23,7 +23,6 @@ #define UDF_FLAG_STRICT 5 #define UDF_FLAG_UNDELETE 6 #define UDF_FLAG_UNHIDE 7 -#define UDF_FLAG_VARCONV 8 #define UDF_FLAG_UID_FORGET 11 /* save -1 for uid to disk */ #define UDF_FLAG_GID_FORGET 12 #define UDF_FLAG_UID_SET 13 @@ -55,6 +54,8 @@ #define MF_DUPLICATE_MD 0x01 #define MF_MIRROR_FE_LOADED 0x02 +#define EFSCORRUPTED EUCLEAN + struct udf_meta_data { __u32 s_meta_file_loc; __u32 s_mirror_file_loc; diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 7e258f15b8ef..88692512a466 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -34,9 +34,6 @@ extern __printf(3, 4) void _udf_warn(struct super_block *sb, #define udf_debug(fmt, ...) 
\ pr_debug("%s:%d:%s: " fmt, __FILE__, __LINE__, __func__, ##__VA_ARGS__) -#define udf_fixed_to_variable(x) ( ( ( (x) >> 5 ) * 39 ) + ( (x) & 0x0000001F ) ) -#define udf_variable_to_fixed(x) ( ( ( (x) / 39 ) << 5 ) + ( (x) % 39 ) ) - #define UDF_EXTENT_LENGTH_MASK 0x3FFFFFFF #define UDF_EXTENT_FLAG_MASK 0xC0000000 @@ -83,14 +80,24 @@ extern const struct inode_operations udf_file_inode_operations; extern const struct file_operations udf_file_operations; extern const struct inode_operations udf_symlink_inode_operations; extern const struct address_space_operations udf_aops; -extern const struct address_space_operations udf_adinicb_aops; extern const struct address_space_operations udf_symlink_aops; -struct udf_fileident_bh { - struct buffer_head *sbh; - struct buffer_head *ebh; - int soffset; - int eoffset; +struct udf_fileident_iter { + struct inode *dir; /* Directory we are working with */ + loff_t pos; /* Logical position in a dir */ + struct buffer_head *bh[2]; /* Buffer containing 'pos' and possibly + * next buffer if entry straddles + * blocks */ + struct kernel_lb_addr eloc; /* Start of extent containing 'pos' */ + uint32_t elen; /* Length of extent containing 'pos' */ + sector_t loffset; /* Block offset of 'pos' within above + * extent */ + struct extent_position epos; /* Position after the above extent */ + struct fileIdentDesc fi; /* Copied directory entry */ + uint8_t *name; /* Pointer to entry name */ + uint8_t *namebuf; /* Storage for entry name in case + * the name is split between two blocks + */ }; struct udf_vds_record { @@ -121,22 +128,16 @@ struct inode *udf_find_metadata_inode_efe(struct super_block *sb, u32 meta_file_loc, u32 partition_num); /* namei.c */ -extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, - struct fileIdentDesc *, struct udf_fileident_bh *, - uint8_t *, uint8_t *); static inline unsigned int udf_dir_entry_len(struct fileIdentDesc *cfi) { return ALIGN(sizeof(struct fileIdentDesc) + le16_to_cpu(cfi->lengthOfImpUse) + cfi->lengthFileIdent, UDF_NAME_PAD); } -static inline uint8_t *udf_get_fi_ident(struct fileIdentDesc *fi) -{ - return ((uint8_t *)(fi + 1)) + le16_to_cpu(fi->lengthOfImpUse); -} /* file.c */ extern long udf_ioctl(struct file *, unsigned int, unsigned long); + /* inode.c */ extern struct inode *__udf_iget(struct super_block *, struct kernel_lb_addr *, bool hidden_inode); @@ -151,16 +152,14 @@ static inline struct inode *udf_iget(struct super_block *sb, return __udf_iget(sb, ino, false); } extern int udf_expand_file_adinicb(struct inode *); -extern struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, - udf_pblk_t *block, int *err); extern struct buffer_head *udf_bread(struct inode *inode, udf_pblk_t block, int create, int *err); extern int udf_setsize(struct inode *, loff_t); extern void udf_evict_inode(struct inode *); extern int udf_write_inode(struct inode *, struct writeback_control *wbc); -extern udf_pblk_t udf_block_map(struct inode *inode, sector_t block); extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *, struct kernel_lb_addr *, uint32_t *, sector_t *); +int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); extern int udf_setup_indirect_aext(struct inode *inode, udf_pblk_t block, struct extent_position *epos); extern int __udf_add_aext(struct inode *inode, struct extent_position *epos, @@ -177,9 +176,6 @@ extern int8_t udf_current_aext(struct inode *, struct extent_position *, extern void udf_update_extra_perms(struct inode *inode, umode_t mode); /* misc.c */ 
-extern struct buffer_head *udf_tgetblk(struct super_block *sb, - udf_pblk_t block); -extern struct buffer_head *udf_tread(struct super_block *sb, udf_pblk_t block); extern struct genericFormat *udf_add_extendedattr(struct inode *, uint32_t, uint32_t, uint8_t); extern struct genericFormat *udf_get_extendedattr(struct inode *, uint32_t, @@ -194,7 +190,7 @@ extern void udf_new_tag(char *, uint16_t, uint16_t, uint16_t, uint32_t, int); /* lowlevel.c */ extern unsigned int udf_get_last_session(struct super_block *); -extern unsigned long udf_get_last_block(struct super_block *); +udf_pblk_t udf_get_last_block(struct super_block *); /* partition.c */ extern uint32_t udf_get_pblock(struct super_block *, uint32_t, uint16_t, @@ -243,14 +239,13 @@ extern udf_pblk_t udf_new_block(struct super_block *sb, struct inode *inode, uint16_t partition, uint32_t goal, int *err); /* directory.c */ -extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *, - struct udf_fileident_bh *, - struct fileIdentDesc *, - struct extent_position *, - struct kernel_lb_addr *, uint32_t *, - sector_t *); -extern struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, - int *offset); +int udf_fiiter_init(struct udf_fileident_iter *iter, struct inode *dir, + loff_t pos); +int udf_fiiter_advance(struct udf_fileident_iter *iter); +void udf_fiiter_release(struct udf_fileident_iter *iter); +void udf_fiiter_write_fi(struct udf_fileident_iter *iter, uint8_t *impuse); +void udf_fiiter_update_elen(struct udf_fileident_iter *iter, uint32_t new_elen); +int udf_fiiter_append_blk(struct udf_fileident_iter *iter); extern struct long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int); extern struct short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int); diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 7e3e08c0166f..06bd84d555bd 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -289,7 +289,7 @@ cg_found: ufs_mark_sb_dirty(sb); inode->i_ino = cg * uspi->s_ipg + bit; - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_blocks = 0; inode->i_generation = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index a873de7dec1c..a4246c83a8cd 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1212,14 +1212,14 @@ out: return err; } -int ufs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ufs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); unsigned int ia_valid = attr->ia_valid; int error; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; @@ -1229,7 +1229,7 @@ int ufs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, return error; } - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 29d5a0e0c8f0..36154b5aca6d 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -69,7 +69,7 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, unsi * If the create succeeds, we fill in the inode information * with d_instantiate(). 
*/ -static int ufs_create (struct user_namespace * mnt_userns, +static int ufs_create (struct mnt_idmap * idmap, struct inode * dir, struct dentry * dentry, umode_t mode, bool excl) { @@ -86,7 +86,7 @@ static int ufs_create (struct user_namespace * mnt_userns, return ufs_add_nondir(dentry, inode); } -static int ufs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int ufs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; @@ -106,7 +106,7 @@ static int ufs_mknod(struct user_namespace *mnt_userns, struct inode *dir, return err; } -static int ufs_symlink (struct user_namespace * mnt_userns, struct inode * dir, +static int ufs_symlink (struct mnt_idmap * idmap, struct inode * dir, struct dentry * dentry, const char * symname) { struct super_block * sb = dir->i_sb; @@ -166,7 +166,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, return error; } -static int ufs_mkdir(struct user_namespace * mnt_userns, struct inode * dir, +static int ufs_mkdir(struct mnt_idmap * idmap, struct inode * dir, struct dentry * dentry, umode_t mode) { struct inode * inode; @@ -243,7 +243,7 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry) return err; } -static int ufs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 550f7c5a3636..6b499180643b 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -123,7 +123,7 @@ extern struct inode *ufs_iget(struct super_block *, unsigned long); extern int ufs_write_inode (struct inode *, struct writeback_control *); extern int ufs_sync_inode (struct inode *); extern void ufs_evict_inode (struct inode *); -extern int ufs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +extern int ufs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); /* namei.c */ diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index cc694846617a..44d1ee429eb0 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -113,7 +113,7 @@ static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, { const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP; - vma->vm_flags = flags; + vm_flags_reset(vma, flags); /* * For shared mappings, we want to enable writenotify while * userfaultfd-wp is enabled (see vma_wants_writenotify()). 
We'll simply @@ -252,14 +252,12 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, unsigned long flags, unsigned long reason) { - struct mm_struct *mm = ctx->mm; pte_t *ptep, pte; bool ret = true; - mmap_assert_locked(mm); - - ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma)); + mmap_assert_locked(ctx->mm); + ptep = hugetlb_walk(vma, address, vma_mmu_pagesize(vma)); if (!ptep) goto out; @@ -391,7 +389,8 @@ static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags) */ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) { - struct mm_struct *mm = vmf->vma->vm_mm; + struct vm_area_struct *vma = vmf->vma; + struct mm_struct *mm = vma->vm_mm; struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue uwq; vm_fault_t ret = VM_FAULT_SIGBUS; @@ -418,7 +417,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) */ mmap_assert_locked(mm); - ctx = vmf->vma->vm_userfaultfd_ctx.ctx; + ctx = vma->vm_userfaultfd_ctx.ctx; if (!ctx) goto out; @@ -508,6 +507,15 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) blocking_state = userfaultfd_get_blocking_state(vmf->flags); + /* + * Take the vma lock now, in order to safely call + * userfaultfd_huge_must_wait() later. Since acquiring the + * (sleepable) vma lock can modify the current task state, that + * must be before explicitly calling set_current_state(). + */ + if (is_vm_hugetlb_page(vma)) + hugetlb_vma_lock_read(vma); + spin_lock_irq(&ctx->fault_pending_wqh.lock); /* * After the __add_wait_queue the uwq is visible to userland @@ -522,13 +530,15 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) set_current_state(blocking_state); spin_unlock_irq(&ctx->fault_pending_wqh.lock); - if (!is_vm_hugetlb_page(vmf->vma)) + if (!is_vm_hugetlb_page(vma)) must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, reason); else - must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma, + must_wait = userfaultfd_huge_must_wait(ctx, vma, vmf->address, vmf->flags, reason); + if (is_vm_hugetlb_page(vma)) + hugetlb_vma_unlock_read(vma); mmap_read_unlock(mm); if (likely(must_wait && !READ_ONCE(ctx->released))) { @@ -873,7 +883,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) /* len == 0 means wake all */ struct userfaultfd_wake_range range = { .len = 0, }; unsigned long new_flags; - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); WRITE_ONCE(ctx->released, true); @@ -890,7 +900,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) */ mmap_write_lock(mm); prev = NULL; - mas_for_each(&mas, vma, ULONG_MAX) { + for_each_vma(vmi, vma) { cond_resched(); BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ !!(vma->vm_flags & __VM_UFFD_FLAGS)); @@ -899,13 +909,12 @@ static int userfaultfd_release(struct inode *inode, struct file *file) continue; } new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, + prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), NULL_VM_UFFD_CTX, anon_vma_name(vma)); if (prev) { - mas_pause(&mas); vma = prev; } else { prev = vma; @@ -1292,7 +1301,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, bool found; bool basic_ioctls; unsigned long start, end, vma_end; - MA_STATE(mas, &mm->mm_mt, 0, 0); + struct vma_iterator vmi; user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1334,17 +1343,13 @@ static int 
userfaultfd_register(struct userfaultfd_ctx *ctx, if (!mmget_not_zero(mm)) goto out; + ret = -EINVAL; mmap_write_lock(mm); - mas_set(&mas, start); - vma = mas_find(&mas, ULONG_MAX); + vma_iter_init(&vmi, mm, start); + vma = vma_find(&vmi, end); if (!vma) goto out_unlock; - /* check that there's at least one vma in the range */ - ret = -EINVAL; - if (vma->vm_start >= end) - goto out_unlock; - /* * If the first vma contains huge pages, make sure start address * is aligned to huge page size. @@ -1361,7 +1366,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, */ found = false; basic_ioctls = false; - for (cur = vma; cur; cur = mas_next(&mas, end - 1)) { + cur = vma; + do { cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ @@ -1418,16 +1424,14 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, basic_ioctls = true; found = true; - } + } for_each_vma_range(vmi, cur, end); BUG_ON(!found); - mas_set(&mas, start); - prev = mas_prev(&mas, 0); - if (prev != vma) - mas_next(&mas, ULONG_MAX); + vma_iter_set(&vmi, start); + prev = vma_prev(&vmi); ret = 0; - do { + for_each_vma_range(vmi, vma, end) { cond_resched(); BUG_ON(!vma_can_userfault(vma, vm_flags)); @@ -1448,30 +1452,25 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vma_end = min(end, vma->vm_end); new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; - prev = vma_merge(mm, prev, start, vma_end, new_flags, + prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), ((struct vm_userfaultfd_ctx){ ctx }), anon_vma_name(vma)); if (prev) { /* vma_merge() invalidated the mas */ - mas_pause(&mas); vma = prev; goto next; } if (vma->vm_start < start) { - ret = split_vma(mm, vma, start, 1); + ret = split_vma(&vmi, vma, start, 1); if (ret) break; - /* split_vma() invalidated the mas */ - mas_pause(&mas); } if (vma->vm_end > end) { - ret = split_vma(mm, vma, end, 0); + ret = split_vma(&vmi, vma, end, 0); if (ret) break; - /* split_vma() invalidated the mas */ - mas_pause(&mas); } next: /* @@ -1488,8 +1487,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, skip: prev = vma; start = vma->vm_end; - vma = mas_next(&mas, end - 1); - } while (vma); + } + out_unlock: mmap_write_unlock(mm); mmput(mm); @@ -1533,7 +1532,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, bool found; unsigned long start, end, vma_end; const void __user *buf = (void __user *)arg; - MA_STATE(mas, &mm->mm_mt, 0, 0); + struct vma_iterator vmi; ret = -EFAULT; if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) @@ -1552,14 +1551,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out; mmap_write_lock(mm); - mas_set(&mas, start); - vma = mas_find(&mas, ULONG_MAX); - if (!vma) - goto out_unlock; - - /* check that there's at least one vma in the range */ ret = -EINVAL; - if (vma->vm_start >= end) + vma_iter_init(&vmi, mm, start); + vma = vma_find(&vmi, end); + if (!vma) goto out_unlock; /* @@ -1577,8 +1572,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * Search for not compatible vmas. 
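/*
 * Editor's sketch (illustrative, not from the patch): the register and
 * unregister paths are being converted from open-coded MA_STATE/mas_*()
 * maple tree walks to the VMA iterator. The new idiom for visiting
 * every VMA that overlaps [start, end) is the one used throughout the
 * hunks above; in isolation, with a hypothetical helper name:
 */
static void example_walk_vmas(struct mm_struct *mm,
			      unsigned long start, unsigned long end)
{
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, start);

	mmap_read_lock(mm);
	for_each_vma_range(vmi, vma, end)
		pr_info("vma %lx-%lx\n", vma->vm_start, vma->vm_end);
	mmap_read_unlock(mm);
}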
*/ found = false; - ret = -EINVAL; - for (cur = vma; cur; cur = mas_next(&mas, end - 1)) { + cur = vma; + do { cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ @@ -1595,16 +1590,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out_unlock; found = true; - } + } for_each_vma_range(vmi, cur, end); BUG_ON(!found); - mas_set(&mas, start); - prev = mas_prev(&mas, 0); - if (prev != vma) - mas_next(&mas, ULONG_MAX); - + vma_iter_set(&vmi, start); + prev = vma_prev(&vmi); ret = 0; - do { + for_each_vma_range(vmi, vma, end) { cond_resched(); BUG_ON(!vma_can_userfault(vma, vma->vm_flags)); @@ -1640,26 +1632,23 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, uffd_wp_range(mm, vma, start, vma_end - start, false); new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - prev = vma_merge(mm, prev, start, vma_end, new_flags, + prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), NULL_VM_UFFD_CTX, anon_vma_name(vma)); if (prev) { vma = prev; - mas_pause(&mas); goto next; } if (vma->vm_start < start) { - ret = split_vma(mm, vma, start, 1); + ret = split_vma(&vmi, vma, start, 1); if (ret) break; - mas_pause(&mas); } if (vma->vm_end > end) { - ret = split_vma(mm, vma, end, 0); + ret = split_vma(&vmi, vma, end, 0); if (ret) break; - mas_pause(&mas); } next: /* @@ -1673,8 +1662,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, skip: prev = vma; start = vma->vm_end; - vma = mas_next(&mas, end - 1); - } while (vma); + } + out_unlock: mmap_write_unlock(mm); mmput(mm); diff --git a/fs/utimes.c b/fs/utimes.c index 39f356017635..3701b3946f88 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -7,6 +7,7 @@ #include <linux/uaccess.h> #include <linux/compat.h> #include <asm/unistd.h> +#include <linux/filelock.h> static bool nsec_valid(long nsec) { @@ -62,7 +63,7 @@ int vfs_utimes(const struct path *path, struct timespec64 *times) } retry_deleg: inode_lock(inode); - error = notify_change(mnt_user_ns(path->mnt), path->dentry, &newattrs, + error = notify_change(mnt_idmap(path->mnt), path->dentry, &newattrs, &delegated_inode); inode_unlock(inode); if (delegated_inode) { diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c index c4769a9396c5..075f15c43c78 100644 --- a/fs/vboxsf/dir.c +++ b/fs/vboxsf/dir.c @@ -294,14 +294,14 @@ out: return err; } -static int vboxsf_dir_mkfile(struct user_namespace *mnt_userns, +static int vboxsf_dir_mkfile(struct mnt_idmap *idmap, struct inode *parent, struct dentry *dentry, umode_t mode, bool excl) { return vboxsf_dir_create(parent, dentry, mode, false, excl, NULL); } -static int vboxsf_dir_mkdir(struct user_namespace *mnt_userns, +static int vboxsf_dir_mkdir(struct mnt_idmap *idmap, struct inode *parent, struct dentry *dentry, umode_t mode) { @@ -387,7 +387,7 @@ static int vboxsf_dir_unlink(struct inode *parent, struct dentry *dentry) return 0; } -static int vboxsf_dir_rename(struct user_namespace *mnt_userns, +static int vboxsf_dir_rename(struct mnt_idmap *idmap, struct inode *old_parent, struct dentry *old_dentry, struct inode *new_parent, @@ -430,7 +430,7 @@ err_put_old_path: return err; } -static int vboxsf_dir_symlink(struct user_namespace *mnt_userns, +static int vboxsf_dir_symlink(struct mnt_idmap *idmap, struct inode *parent, struct dentry *dentry, const char *symname) { diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c index e1db0f3f7e5e..dd0ae1188e87 100644 --- a/fs/vboxsf/utils.c +++ b/fs/vboxsf/utils.c @@ -231,7 +231,7 @@ int vboxsf_inode_revalidate(struct dentry 
*dentry) return 0; } -int vboxsf_getattr(struct user_namespace *mnt_userns, const struct path *path, +int vboxsf_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *kstat, u32 request_mask, unsigned int flags) { int err; @@ -252,11 +252,11 @@ int vboxsf_getattr(struct user_namespace *mnt_userns, const struct path *path, if (err) return err; - generic_fillattr(&init_user_ns, d_inode(dentry), kstat); + generic_fillattr(&nop_mnt_idmap, d_inode(dentry), kstat); return 0; } -int vboxsf_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int vboxsf_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct vboxsf_inode *sf_i = VBOXSF_I(d_inode(dentry)); diff --git a/fs/vboxsf/vfsmod.h b/fs/vboxsf/vfsmod.h index 9047befa66c5..05973eb89d52 100644 --- a/fs/vboxsf/vfsmod.h +++ b/fs/vboxsf/vfsmod.h @@ -97,10 +97,10 @@ int vboxsf_stat(struct vboxsf_sbi *sbi, struct shfl_string *path, struct shfl_fsobjinfo *info); int vboxsf_stat_dentry(struct dentry *dentry, struct shfl_fsobjinfo *info); int vboxsf_inode_revalidate(struct dentry *dentry); -int vboxsf_getattr(struct user_namespace *mnt_userns, const struct path *path, +int vboxsf_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *kstat, u32 request_mask, unsigned int query_flags); -int vboxsf_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int vboxsf_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr); struct shfl_string *vboxsf_path_from_dentry(struct vboxsf_sbi *sbi, struct dentry *dentry); diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig index aad1f1d998b9..a7ffd718f171 100644 --- a/fs/verity/Kconfig +++ b/fs/verity/Kconfig @@ -34,14 +34,6 @@ config FS_VERITY If unsure, say N. -config FS_VERITY_DEBUG - bool "FS Verity debugging" - depends on FS_VERITY - help - Enable debugging messages related to fs-verity by default. - - Say N unless you are an fs-verity developer. - config FS_VERITY_BUILTIN_SIGNATURES bool "FS Verity builtin signature support" depends on FS_VERITY diff --git a/fs/verity/enable.c b/fs/verity/enable.c index df6b499bf6a1..e13db6507b38 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -7,136 +7,50 @@ #include "fsverity_private.h" -#include <crypto/hash.h> -#include <linux/backing-dev.h> #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/sched/signal.h> #include <linux/uaccess.h> -/* - * Read a file data page for Merkle tree construction. Do aggressive readahead, - * since we're sequentially reading the entire file. 
- */ -static struct page *read_file_data_page(struct file *file, pgoff_t index, - struct file_ra_state *ra, - unsigned long remaining_pages) -{ - DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, index); - struct folio *folio; - - folio = __filemap_get_folio(ractl.mapping, index, FGP_ACCESSED, 0); - if (!folio || !folio_test_uptodate(folio)) { - if (folio) - folio_put(folio); - else - page_cache_sync_ra(&ractl, remaining_pages); - folio = read_cache_folio(ractl.mapping, index, NULL, file); - if (IS_ERR(folio)) - return &folio->page; - } - if (folio_test_readahead(folio)) - page_cache_async_ra(&ractl, folio, remaining_pages); - return folio_file_page(folio, index); -} +struct block_buffer { + u32 filled; + u8 *data; +}; -static int build_merkle_tree_level(struct file *filp, unsigned int level, - u64 num_blocks_to_hash, - const struct merkle_tree_params *params, - u8 *pending_hashes, - struct ahash_request *req) +/* Hash a block, writing the result to the next level's pending block buffer. */ +static int hash_one_block(struct inode *inode, + const struct merkle_tree_params *params, + struct ahash_request *req, struct block_buffer *cur) { - struct inode *inode = file_inode(filp); - const struct fsverity_operations *vops = inode->i_sb->s_vop; - struct file_ra_state ra = { 0 }; - unsigned int pending_size = 0; - u64 dst_block_num; - u64 i; + struct block_buffer *next = cur + 1; int err; - if (WARN_ON(params->block_size != PAGE_SIZE)) /* checked earlier too */ - return -EINVAL; - - if (level < params->num_levels) { - dst_block_num = params->level_start[level]; - } else { - if (WARN_ON(num_blocks_to_hash != 1)) - return -EINVAL; - dst_block_num = 0; /* unused */ - } + /* Zero-pad the block if it's shorter than the block size. */ + memset(&cur->data[cur->filled], 0, params->block_size - cur->filled); - file_ra_state_init(&ra, filp->f_mapping); - - for (i = 0; i < num_blocks_to_hash; i++) { - struct page *src_page; - - if ((pgoff_t)i % 10000 == 0 || i + 1 == num_blocks_to_hash) - pr_debug("Hashing block %llu of %llu for level %u\n", - i + 1, num_blocks_to_hash, level); - - if (level == 0) { - /* Leaf: hashing a data block */ - src_page = read_file_data_page(filp, i, &ra, - num_blocks_to_hash - i); - if (IS_ERR(src_page)) { - err = PTR_ERR(src_page); - fsverity_err(inode, - "Error %d reading data page %llu", - err, i); - return err; - } - } else { - unsigned long num_ra_pages = - min_t(unsigned long, num_blocks_to_hash - i, - inode->i_sb->s_bdi->io_pages); - - /* Non-leaf: hashing hash block from level below */ - src_page = vops->read_merkle_tree_page(inode, - params->level_start[level - 1] + i, - num_ra_pages); - if (IS_ERR(src_page)) { - err = PTR_ERR(src_page); - fsverity_err(inode, - "Error %d reading Merkle tree page %llu", - err, params->level_start[level - 1] + i); - return err; - } - } + err = fsverity_hash_block(params, inode, req, virt_to_page(cur->data), + offset_in_page(cur->data), + &next->data[next->filled]); + if (err) + return err; + next->filled += params->digest_size; + cur->filled = 0; + return 0; +} - err = fsverity_hash_page(params, inode, req, src_page, - &pending_hashes[pending_size]); - put_page(src_page); - if (err) - return err; - pending_size += params->digest_size; - - if (level == params->num_levels) /* Root hash? 
*/ - return 0; - - if (pending_size + params->digest_size > params->block_size || - i + 1 == num_blocks_to_hash) { - /* Flush the pending hash block */ - memset(&pending_hashes[pending_size], 0, - params->block_size - pending_size); - err = vops->write_merkle_tree_block(inode, - pending_hashes, - dst_block_num, - params->log_blocksize); - if (err) { - fsverity_err(inode, - "Error %d writing Merkle tree block %llu", - err, dst_block_num); - return err; - } - dst_block_num++; - pending_size = 0; - } +static int write_merkle_tree_block(struct inode *inode, const u8 *buf, + unsigned long index, + const struct merkle_tree_params *params) +{ + u64 pos = (u64)index << params->log_blocksize; + int err; - if (fatal_signal_pending(current)) - return -EINTR; - cond_resched(); - } - return 0; + err = inode->i_sb->s_vop->write_merkle_tree_block(inode, buf, pos, + params->block_size); + if (err) + fsverity_err(inode, "Error %d writing Merkle tree block %lu", + err, index); + return err; } /* @@ -152,13 +66,17 @@ static int build_merkle_tree(struct file *filp, u8 *root_hash) { struct inode *inode = file_inode(filp); - u8 *pending_hashes; + const u64 data_size = inode->i_size; + const int num_levels = params->num_levels; struct ahash_request *req; - u64 blocks; - unsigned int level; - int err = -ENOMEM; + struct block_buffer _buffers[1 + FS_VERITY_MAX_LEVELS + 1] = {}; + struct block_buffer *buffers = &_buffers[1]; + unsigned long level_offset[FS_VERITY_MAX_LEVELS]; + int level; + u64 offset; + int err; - if (inode->i_size == 0) { + if (data_size == 0) { /* Empty file is a special case; root hash is all 0's */ memset(root_hash, 0, params->digest_size); return 0; @@ -167,29 +85,95 @@ static int build_merkle_tree(struct file *filp, /* This allocation never fails, since it's mempool-backed. */ req = fsverity_alloc_hash_request(params->hash_alg, GFP_KERNEL); - pending_hashes = kmalloc(params->block_size, GFP_KERNEL); - if (!pending_hashes) - goto out; - /* - * Build each level of the Merkle tree, starting at the leaf level - * (level 0) and ascending to the root node (level 'num_levels - 1'). - * Then at the end (level 'num_levels'), calculate the root hash. + * Allocate the block buffers. Buffer "-1" is for data blocks. + * Buffers 0 <= level < num_levels are for the actual tree levels. + * Buffer 'num_levels' is for the root hash. 
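/*
 * Editor's note (illustrative arithmetic, not from the patch): the
 * buffer layout described above follows from the tree geometry. With
 * SHA-256 (32-byte digests) and 4096-byte blocks, a tree block holds
 * 4096 / 32 = 128 hashes, so a 1 GiB file (262144 data blocks) needs
 * 2048 level-0 blocks, 16 level-1 blocks and one level-2 block, whose
 * hash is the root hash. The level count, mirroring the patch's own
 * shift-based arithmetic:
 */
static unsigned int example_num_levels(u64 data_size, u8 log_blocksize,
				       u8 log_arity)
{
	u64 blocks = (data_size + (1 << log_blocksize) - 1) >> log_blocksize;
	unsigned int levels = 0;

	while (blocks > 1) {
		blocks = (blocks + (1 << log_arity) - 1) >> log_arity;
		levels++;		/* one more level of hash blocks */
	}
	return levels;			/* 3 in the 1 GiB example above */
}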
*/ - blocks = ((u64)inode->i_size + params->block_size - 1) >> - params->log_blocksize; - for (level = 0; level <= params->num_levels; level++) { - err = build_merkle_tree_level(filp, level, blocks, params, - pending_hashes, req); + for (level = -1; level < num_levels; level++) { + buffers[level].data = kzalloc(params->block_size, GFP_KERNEL); + if (!buffers[level].data) { + err = -ENOMEM; + goto out; + } + } + buffers[num_levels].data = root_hash; + + BUILD_BUG_ON(sizeof(level_offset) != sizeof(params->level_start)); + memcpy(level_offset, params->level_start, sizeof(level_offset)); + + /* Hash each data block, also hashing the tree blocks as they fill up */ + for (offset = 0; offset < data_size; offset += params->block_size) { + ssize_t bytes_read; + loff_t pos = offset; + + buffers[-1].filled = min_t(u64, params->block_size, + data_size - offset); + bytes_read = __kernel_read(filp, buffers[-1].data, + buffers[-1].filled, &pos); + if (bytes_read < 0) { + err = bytes_read; + fsverity_err(inode, "Error %d reading file data", err); + goto out; + } + if (bytes_read != buffers[-1].filled) { + err = -EINVAL; + fsverity_err(inode, "Short read of file data"); + goto out; + } + err = hash_one_block(inode, params, req, &buffers[-1]); if (err) goto out; - blocks = (blocks + params->hashes_per_block - 1) >> - params->log_arity; + for (level = 0; level < num_levels; level++) { + if (buffers[level].filled + params->digest_size <= + params->block_size) { + /* Next block at @level isn't full yet */ + break; + } + /* Next block at @level is full */ + + err = hash_one_block(inode, params, req, + &buffers[level]); + if (err) + goto out; + err = write_merkle_tree_block(inode, + buffers[level].data, + level_offset[level], + params); + if (err) + goto out; + level_offset[level]++; + } + if (fatal_signal_pending(current)) { + err = -EINTR; + goto out; + } + cond_resched(); + } + /* Finish all nonempty pending tree blocks. */ + for (level = 0; level < num_levels; level++) { + if (buffers[level].filled != 0) { + err = hash_one_block(inode, params, req, + &buffers[level]); + if (err) + goto out; + err = write_merkle_tree_block(inode, + buffers[level].data, + level_offset[level], + params); + if (err) + goto out; + } + } + /* The root hash was filled by the last call to hash_one_block(). */ + if (WARN_ON(buffers[num_levels].filled != params->digest_size)) { + err = -EINVAL; + goto out; } - memcpy(root_hash, pending_hashes, params->digest_size); err = 0; out: - kfree(pending_hashes); + for (level = -1; level < num_levels; level++) + kfree(buffers[level].data); fsverity_free_hash_request(params->hash_alg, req); return err; } @@ -263,15 +247,12 @@ static int enable_verity(struct file *filp, * ->begin_enable_verity() and ->end_enable_verity() using the inode * lock and only allow one process to be here at a time on a given file. */ - pr_debug("Building Merkle tree...\n"); BUILD_BUG_ON(sizeof(desc->root_hash) < FS_VERITY_MAX_DIGEST_SIZE); err = build_merkle_tree(filp, ¶ms, desc->root_hash); if (err) { fsverity_err(inode, "Error %d building Merkle tree", err); goto rollback; } - pr_debug("Done building Merkle tree. Root hash is %s:%*phN\n", - params.hash_alg->name, params.digest_size, desc->root_hash); /* * Create the fsverity_info. Don't bother trying to save work by @@ -286,10 +267,6 @@ static int enable_verity(struct file *filp, goto rollback; } - if (arg->sig_size) - pr_debug("Storing a %u-byte PKCS#7 signature alongside the file\n", - arg->sig_size); - /* * Tell the filesystem to finish enabling verity on the file. 
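/*
 * Editor's note (worked trace, not from the patch): the data loop above
 * behaves like a base-'hashes_per_block' carry chain. With 128 hashes
 * per tree block, hashing data block 127 brings buffers[0].filled up to
 * block_size, so the inner loop hashes that level-0 block into
 * buffers[1], writes it out at level_offset[0], and advances the
 * offset. Only after 128 such level-0 writeouts does a level-1 block
 * fill and ripple upward, so at most one partially filled block per
 * level is ever held in memory.
 */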
* Serialized with ->begin_enable_verity() by the inode lock. @@ -352,7 +329,7 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *uarg) memchr_inv(arg.__reserved2, 0, sizeof(arg.__reserved2))) return -EINVAL; - if (arg.block_size != PAGE_SIZE) + if (!is_power_of_2(arg.block_size)) return -EINVAL; if (arg.salt_size > sizeof_field(struct fsverity_descriptor, salt)) diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index c7fcb855e068..d34dcc033d72 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -8,10 +8,6 @@ #ifndef _FSVERITY_PRIVATE_H #define _FSVERITY_PRIVATE_H -#ifdef CONFIG_FS_VERITY_DEBUG -#define DEBUG -#endif - #define pr_fmt(fmt) "fs-verity: " fmt #include <linux/fsverity.h> @@ -46,17 +42,20 @@ struct merkle_tree_params { unsigned int digest_size; /* same as hash_alg->digest_size */ unsigned int block_size; /* size of data and tree blocks */ unsigned int hashes_per_block; /* number of hashes per tree block */ - unsigned int log_blocksize; /* log2(block_size) */ - unsigned int log_arity; /* log2(hashes_per_block) */ + unsigned int blocks_per_page; /* PAGE_SIZE / block_size */ + u8 log_digestsize; /* log2(digest_size) */ + u8 log_blocksize; /* log2(block_size) */ + u8 log_arity; /* log2(hashes_per_block) */ + u8 log_blocks_per_page; /* log2(blocks_per_page) */ unsigned int num_levels; /* number of levels in Merkle tree */ u64 tree_size; /* Merkle tree size in bytes */ - unsigned long level0_blocks; /* number of blocks in tree level 0 */ + unsigned long tree_pages; /* Merkle tree size in pages */ /* * Starting block index for each tree level, ordered from leaf level (0) * to root level ('num_levels - 1') */ - u64 level_start[FS_VERITY_MAX_LEVELS]; + unsigned long level_start[FS_VERITY_MAX_LEVELS]; }; /* @@ -73,9 +72,10 @@ struct fsverity_info { u8 root_hash[FS_VERITY_MAX_DIGEST_SIZE]; u8 file_digest[FS_VERITY_MAX_DIGEST_SIZE]; const struct inode *inode; + unsigned long *hash_block_verified; + spinlock_t hash_page_init_lock; }; - #define FS_VERITY_MAX_SIGNATURE_SIZE (FS_VERITY_MAX_DESCRIPTOR_SIZE - \ sizeof(struct fsverity_descriptor)) @@ -91,9 +91,9 @@ void fsverity_free_hash_request(struct fsverity_hash_alg *alg, struct ahash_request *req); const u8 *fsverity_prepare_hash_state(struct fsverity_hash_alg *alg, const u8 *salt, size_t salt_size); -int fsverity_hash_page(const struct merkle_tree_params *params, - const struct inode *inode, - struct ahash_request *req, struct page *page, u8 *out); +int fsverity_hash_block(const struct merkle_tree_params *params, + const struct inode *inode, struct ahash_request *req, + struct page *page, unsigned int offset, u8 *out); int fsverity_hash_buffer(struct fsverity_hash_alg *alg, const void *data, size_t size, u8 *out); void __init fsverity_check_hash_algs(void); diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c index 6f8170cf4ae7..13fcf31be844 100644 --- a/fs/verity/hash_algs.c +++ b/fs/verity/hash_algs.c @@ -220,35 +220,33 @@ err_free: } /** - * fsverity_hash_page() - hash a single data or hash page + * fsverity_hash_block() - hash a single data or hash block * @params: the Merkle tree's parameters * @inode: inode for which the hashing is being done * @req: preallocated hash request - * @page: the page to hash + * @page: the page containing the block to hash + * @offset: the offset of the block within @page * @out: output digest, size 'params->digest_size' bytes * - * Hash a single data or hash block, assuming block_size == PAGE_SIZE. 
- * The hash is salted if a salt is specified in the Merkle tree parameters. + * Hash a single data or hash block. The hash is salted if a salt is specified + * in the Merkle tree parameters. * * Return: 0 on success, -errno on failure */ -int fsverity_hash_page(const struct merkle_tree_params *params, - const struct inode *inode, - struct ahash_request *req, struct page *page, u8 *out) +int fsverity_hash_block(const struct merkle_tree_params *params, + const struct inode *inode, struct ahash_request *req, + struct page *page, unsigned int offset, u8 *out) { struct scatterlist sg; DECLARE_CRYPTO_WAIT(wait); int err; - if (WARN_ON(params->block_size != PAGE_SIZE)) - return -EINVAL; - sg_init_table(&sg, 1); - sg_set_page(&sg, page, PAGE_SIZE, 0); + sg_set_page(&sg, page, params->block_size, offset); ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | CRYPTO_TFM_REQ_MAY_BACKLOG, crypto_req_done, &wait); - ahash_request_set_crypt(req, &sg, out, PAGE_SIZE); + ahash_request_set_crypt(req, &sg, out, params->block_size); if (params->hashstate) { err = crypto_ahash_import(req, params->hashstate); @@ -264,7 +262,7 @@ int fsverity_hash_page(const struct merkle_tree_params *params, err = crypto_wait_req(err, &wait); if (err) - fsverity_err(inode, "Error %d computing page hash", err); + fsverity_err(inode, "Error %d computing block hash", err); return err; } diff --git a/fs/verity/init.c b/fs/verity/init.c index c98b7016f446..023905151035 100644 --- a/fs/verity/init.c +++ b/fs/verity/init.c @@ -49,7 +49,6 @@ static int __init fsverity_init(void) if (err) goto err_exit_workqueue; - pr_debug("Initialized fs-verity\n"); return 0; err_exit_workqueue: diff --git a/fs/verity/open.c b/fs/verity/open.c index 81ff94442f7b..9366b441d01c 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -7,6 +7,7 @@ #include "fsverity_private.h" +#include <linux/mm.h> #include <linux/slab.h> static struct kmem_cache *fsverity_info_cachep; @@ -34,6 +35,7 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, struct fsverity_hash_alg *hash_alg; int err; u64 blocks; + u64 blocks_in_level[FS_VERITY_MAX_LEVELS]; u64 offset; int level; @@ -54,7 +56,23 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, goto out_err; } - if (log_blocksize != PAGE_SHIFT) { + /* + * fs/verity/ directly assumes that the Merkle tree block size is a + * power of 2 less than or equal to PAGE_SIZE. Another restriction + * arises from the interaction between fs/verity/ and the filesystems + * themselves: filesystems expect to be able to verify a single + * filesystem block of data at a time. Therefore, the Merkle tree block + * size must also be less than or equal to the filesystem block size. + * + * The above are the only hard limitations, so in theory the Merkle tree + * block size could be as small as twice the digest size. However, + * that's not useful, and it would result in some unusually deep and + * large Merkle trees. So we currently require that the Merkle tree + * block size be at least 1024 bytes. That's small enough to test the + * sub-page block case on systems with 4K pages, but not too small. 
+ */ + if (log_blocksize < 10 || log_blocksize > PAGE_SHIFT || + log_blocksize > inode->i_blkbits) { fsverity_warn(inode, "Unsupported log_blocksize: %u", log_blocksize); err = -EINVAL; @@ -62,6 +80,8 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, } params->log_blocksize = log_blocksize; params->block_size = 1 << log_blocksize; + params->log_blocks_per_page = PAGE_SHIFT - log_blocksize; + params->blocks_per_page = 1 << params->log_blocks_per_page; if (WARN_ON(!is_power_of_2(params->digest_size))) { err = -EINVAL; @@ -74,13 +94,10 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, err = -EINVAL; goto out_err; } - params->log_arity = params->log_blocksize - ilog2(params->digest_size); + params->log_digestsize = ilog2(params->digest_size); + params->log_arity = log_blocksize - params->log_digestsize; params->hashes_per_block = 1 << params->log_arity; - pr_debug("Merkle tree uses %s with %u-byte blocks (%u hashes/block), salt=%*phN\n", - hash_alg->name, params->block_size, params->hashes_per_block, - (int)salt_size, salt); - /* * Compute the number of levels in the Merkle tree and create a map from * level to the starting block of that level. Level 'num_levels - 1' is @@ -90,31 +107,45 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, /* Compute number of levels and the number of blocks in each level */ blocks = ((u64)inode->i_size + params->block_size - 1) >> log_blocksize; - pr_debug("Data is %lld bytes (%llu blocks)\n", inode->i_size, blocks); while (blocks > 1) { if (params->num_levels >= FS_VERITY_MAX_LEVELS) { fsverity_err(inode, "Too many levels in Merkle tree"); - err = -EINVAL; + err = -EFBIG; goto out_err; } blocks = (blocks + params->hashes_per_block - 1) >> params->log_arity; - /* temporarily using level_start[] to store blocks in level */ - params->level_start[params->num_levels++] = blocks; + blocks_in_level[params->num_levels++] = blocks; } - params->level0_blocks = params->level_start[0]; /* Compute the starting block of each level */ offset = 0; for (level = (int)params->num_levels - 1; level >= 0; level--) { - blocks = params->level_start[level]; params->level_start[level] = offset; - pr_debug("Level %d is %llu blocks starting at index %llu\n", - level, blocks, offset); - offset += blocks; + offset += blocks_in_level[level]; + } + + /* + * With block_size != PAGE_SIZE, an in-memory bitmap will need to be + * allocated to track the "verified" status of hash blocks. Don't allow + * this bitmap to get too large. For now, limit it to 1 MiB, which + * limits the file size to about 4.4 TB with SHA-256 and 4K blocks. + * + * Together with the fact that the data, and thus also the Merkle tree, + * cannot have more than ULONG_MAX pages, this implies that hash block + * indices can always fit in an 'unsigned long'. But to be safe, we + * explicitly check for that too. Note, this is only for hash block + * indices; data block indices might not fit in an 'unsigned long'. 
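/*
 * Editor's note (back-of-the-envelope check, not from the patch): the
 * "1 MiB" limit above is the '1 << 23' test in the code that follows;
 * 2^23 bits track 2^23 hash blocks. With SHA-256 and 4K blocks the
 * level-0 blocks dominate the tree and each covers 128 data blocks
 * (512 KiB), so the cap is roughly 2^23 * 512 KiB = 4 TiB (~4.4 TB),
 * matching the figure quoted above.
 */
static u64 example_max_verity_file_size(void)
{
	const u64 max_tree_blocks = 1ULL << 23;	/* 1 MiB bitmap, 1 bit/block */
	const u64 hashes_per_block = 4096 / 32;	/* SHA-256, 4K tree blocks */

	/* Approximation: count only level-0 coverage, ignore upper levels. */
	return max_tree_blocks * hashes_per_block * 4096;
}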
+ */ + if ((params->block_size != PAGE_SIZE && offset > 1 << 23) || + offset > ULONG_MAX) { + fsverity_err(inode, "Too many blocks in Merkle tree"); + err = -EFBIG; + goto out_err; } params->tree_size = offset << log_blocksize; + params->tree_pages = PAGE_ALIGN(params->tree_size) >> PAGE_SHIFT; return 0; out_err: @@ -165,7 +196,7 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode, fsverity_err(inode, "Error %d initializing Merkle tree parameters", err); - goto out; + goto fail; } memcpy(vi->root_hash, desc->root_hash, vi->tree_params.digest_size); @@ -174,20 +205,48 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode, vi->file_digest); if (err) { fsverity_err(inode, "Error %d computing file digest", err); - goto out; + goto fail; } - pr_debug("Computed file digest: %s:%*phN\n", - vi->tree_params.hash_alg->name, - vi->tree_params.digest_size, vi->file_digest); err = fsverity_verify_signature(vi, desc->signature, le32_to_cpu(desc->sig_size)); -out: - if (err) { - fsverity_free_info(vi); - vi = ERR_PTR(err); + if (err) + goto fail; + + if (vi->tree_params.block_size != PAGE_SIZE) { + /* + * When the Merkle tree block size and page size differ, we use + * a bitmap to keep track of which hash blocks have been + * verified. This bitmap must contain one bit per hash block, + * including alignment to a page boundary at the end. + * + * Eventually, to support extremely large files in an efficient + * way, it might be necessary to make pages of this bitmap + * reclaimable. But for now, simply allocating the whole bitmap + * is a simple solution that works well on the files on which + * fsverity is realistically used. E.g., with SHA-256 and 4K + * blocks, a 100MB file only needs a 24-byte bitmap, and the + * bitmap for any file under 17GB fits in a 4K page. + */ + unsigned long num_bits = + vi->tree_params.tree_pages << + vi->tree_params.log_blocks_per_page; + + vi->hash_block_verified = kvcalloc(BITS_TO_LONGS(num_bits), + sizeof(unsigned long), + GFP_KERNEL); + if (!vi->hash_block_verified) { + err = -ENOMEM; + goto fail; + } + spin_lock_init(&vi->hash_page_init_lock); } + return vi; + +fail: + fsverity_free_info(vi); + return ERR_PTR(err); } void fsverity_set_info(struct inode *inode, struct fsverity_info *vi) @@ -214,6 +273,7 @@ void fsverity_free_info(struct fsverity_info *vi) if (!vi) return; kfree(vi->tree_params.hashstate); + kvfree(vi->hash_block_verified); kmem_cache_free(fsverity_info_cachep, vi); } @@ -325,67 +385,28 @@ out_free_desc: return err; } -/** - * fsverity_file_open() - prepare to open a verity file - * @inode: the inode being opened - * @filp: the struct file being set up - * - * When opening a verity file, deny the open if it is for writing. Otherwise, - * set up the inode's ->i_verity_info if not already done. - * - * When combined with fscrypt, this must be called after fscrypt_file_open(). - * Otherwise, we won't have the key set up to decrypt the verity metadata. 
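/*
 * Editor's sketch (assumption, paraphrased): the fsverity_file_open()
 * kerneldoc being dropped here describes behaviour that, going by the
 * new __fsverity_file_open() export below, is expected to move into
 * inline wrappers in include/linux/fsverity.h, so that non-verity files
 * never enter the exported slow path. Roughly:
 */
static inline int fsverity_file_open_sketch(struct inode *inode,
					    struct file *filp)
{
	if (IS_VERITY(inode))
		return __fsverity_file_open(inode, filp);
	return 0;
}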
- * - * Return: 0 on success, -errno on failure - */ -int fsverity_file_open(struct inode *inode, struct file *filp) +int __fsverity_file_open(struct inode *inode, struct file *filp) { - if (!IS_VERITY(inode)) - return 0; - - if (filp->f_mode & FMODE_WRITE) { - pr_debug("Denying opening verity file (ino %lu) for write\n", - inode->i_ino); + if (filp->f_mode & FMODE_WRITE) return -EPERM; - } - return ensure_verity_info(inode); } -EXPORT_SYMBOL_GPL(fsverity_file_open); +EXPORT_SYMBOL_GPL(__fsverity_file_open); -/** - * fsverity_prepare_setattr() - prepare to change a verity inode's attributes - * @dentry: dentry through which the inode is being changed - * @attr: attributes to change - * - * Verity files are immutable, so deny truncates. This isn't covered by the - * open-time check because sys_truncate() takes a path, not a file descriptor. - * - * Return: 0 on success, -errno on failure - */ -int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr) +int __fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr) { - if (IS_VERITY(d_inode(dentry)) && (attr->ia_valid & ATTR_SIZE)) { - pr_debug("Denying truncate of verity file (ino %lu)\n", - d_inode(dentry)->i_ino); + if (attr->ia_valid & ATTR_SIZE) return -EPERM; - } return 0; } -EXPORT_SYMBOL_GPL(fsverity_prepare_setattr); +EXPORT_SYMBOL_GPL(__fsverity_prepare_setattr); -/** - * fsverity_cleanup_inode() - free the inode's verity info, if present - * @inode: an inode being evicted - * - * Filesystems must call this on inode eviction to free ->i_verity_info. - */ -void fsverity_cleanup_inode(struct inode *inode) +void __fsverity_cleanup_inode(struct inode *inode) { fsverity_free_info(inode->i_verity_info); inode->i_verity_info = NULL; } -EXPORT_SYMBOL_GPL(fsverity_cleanup_inode); +EXPORT_SYMBOL_GPL(__fsverity_cleanup_inode); int __init fsverity_init_info_cache(void) { diff --git a/fs/verity/signature.c b/fs/verity/signature.c index 143a530a8008..e7d3ca919a1e 100644 --- a/fs/verity/signature.c +++ b/fs/verity/signature.c @@ -82,8 +82,6 @@ int fsverity_verify_signature(const struct fsverity_info *vi, return err; } - pr_debug("Valid signature for file digest %s:%*phN\n", - hash_alg->name, hash_alg->digest_size, vi->file_digest); return 0; } diff --git a/fs/verity/verify.c b/fs/verity/verify.c index 961ba248021f..f50e3b5b52c9 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -9,39 +9,12 @@ #include <crypto/hash.h> #include <linux/bio.h> -#include <linux/ratelimit.h> static struct workqueue_struct *fsverity_read_workqueue; -/** - * hash_at_level() - compute the location of the block's hash at the given level - * - * @params: (in) the Merkle tree parameters - * @dindex: (in) the index of the data block being verified - * @level: (in) the level of hash we want (0 is leaf level) - * @hindex: (out) the index of the hash block containing the wanted hash - * @hoffset: (out) the byte offset to the wanted hash within the hash block - */ -static void hash_at_level(const struct merkle_tree_params *params, - pgoff_t dindex, unsigned int level, pgoff_t *hindex, - unsigned int *hoffset) -{ - pgoff_t position; - - /* Offset of the hash within the level's region, in hashes */ - position = dindex >> (level * params->log_arity); - - /* Index of the hash block in the tree overall */ - *hindex = params->level_start[level] + (position >> params->log_arity); - - /* Offset of the wanted hash (in bytes) within the hash block */ - *hoffset = (position & ((1 << params->log_arity) - 1)) << - (params->log_blocksize - params->log_arity); 
-} - static inline int cmp_hashes(const struct fsverity_info *vi, const u8 *want_hash, const u8 *real_hash, - pgoff_t index, int level) + u64 data_pos, int level) { const unsigned int hsize = vi->tree_params.digest_size; @@ -49,159 +22,312 @@ static inline int cmp_hashes(const struct fsverity_info *vi, return 0; fsverity_err(vi->inode, - "FILE CORRUPTED! index=%lu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN", - index, level, + "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN", + data_pos, level, vi->tree_params.hash_alg->name, hsize, want_hash, vi->tree_params.hash_alg->name, hsize, real_hash); return -EBADMSG; } +static bool data_is_zeroed(struct inode *inode, struct page *page, + unsigned int len, unsigned int offset) +{ + void *virt = kmap_local_page(page); + + if (memchr_inv(virt + offset, 0, len)) { + kunmap_local(virt); + fsverity_err(inode, + "FILE CORRUPTED! Data past EOF is not zeroed"); + return false; + } + kunmap_local(virt); + return true; +} + +/* + * Returns true if the hash block with index @hblock_idx in the tree, located in + * @hpage, has already been verified. + */ +static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage, + unsigned long hblock_idx) +{ + bool verified; + unsigned int blocks_per_page; + unsigned int i; + + /* + * When the Merkle tree block size and page size are the same, then the + * ->hash_block_verified bitmap isn't allocated, and we use PG_checked + * to directly indicate whether the page's block has been verified. + * + * Using PG_checked also guarantees that we re-verify hash pages that + * get evicted and re-instantiated from the backing storage, as new + * pages always start out with PG_checked cleared. + */ + if (!vi->hash_block_verified) + return PageChecked(hpage); + + /* + * When the Merkle tree block size and page size differ, we use a bitmap + * to indicate whether each hash block has been verified. + * + * However, we still need to ensure that hash pages that get evicted and + * re-instantiated from the backing storage are re-verified. To do + * this, we use PG_checked again, but now it doesn't really mean + * "checked". Instead, now it just serves as an indicator for whether + * the hash page is newly instantiated or not. + * + * The first thread that sees PG_checked=0 must clear the corresponding + * bitmap bits, then set PG_checked=1. This requires a spinlock. To + * avoid having to take this spinlock in the common case of + * PG_checked=1, we start with an opportunistic lockless read. + */ + if (PageChecked(hpage)) { + /* + * A read memory barrier is needed here to give ACQUIRE + * semantics to the above PageChecked() test. + */ + smp_rmb(); + return test_bit(hblock_idx, vi->hash_block_verified); + } + spin_lock(&vi->hash_page_init_lock); + if (PageChecked(hpage)) { + verified = test_bit(hblock_idx, vi->hash_block_verified); + } else { + blocks_per_page = vi->tree_params.blocks_per_page; + hblock_idx = round_down(hblock_idx, blocks_per_page); + for (i = 0; i < blocks_per_page; i++) + clear_bit(hblock_idx + i, vi->hash_block_verified); + /* + * A write memory barrier is needed here to give RELEASE + * semantics to the below SetPageChecked() operation. + */ + smp_wmb(); + SetPageChecked(hpage); + verified = false; + } + spin_unlock(&vi->hash_page_init_lock); + return verified; +} + /* - * Verify a single data page against the file's Merkle tree. + * Verify a single data block against the file's Merkle tree. * * In principle, we need to verify the entire path to the root node. 
However, - * for efficiency the filesystem may cache the hash pages. Therefore we need - * only ascend the tree until an already-verified page is seen, as indicated by - * the PageChecked bit being set; then verify the path to that page. - * - * This code currently only supports the case where the verity block size is - * equal to PAGE_SIZE. Doing otherwise would be possible but tricky, since we - * wouldn't be able to use the PageChecked bit. - * - * Note that multiple processes may race to verify a hash page and mark it - * Checked, but it doesn't matter; the result will be the same either way. + * for efficiency the filesystem may cache the hash blocks. Therefore we need + * only ascend the tree until an already-verified hash block is seen, and then + * verify the path to that block. * - * Return: true if the page is valid, else false. + * Return: %true if the data block is valid, else %false. */ -static bool verify_page(struct inode *inode, const struct fsverity_info *vi, - struct ahash_request *req, struct page *data_page, - unsigned long level0_ra_pages) +static bool +verify_data_block(struct inode *inode, struct fsverity_info *vi, + struct ahash_request *req, struct page *data_page, + u64 data_pos, unsigned int dblock_offset_in_page, + unsigned long max_ra_pages) { const struct merkle_tree_params *params = &vi->tree_params; const unsigned int hsize = params->digest_size; - const pgoff_t index = data_page->index; int level; u8 _want_hash[FS_VERITY_MAX_DIGEST_SIZE]; const u8 *want_hash; u8 real_hash[FS_VERITY_MAX_DIGEST_SIZE]; - struct page *hpages[FS_VERITY_MAX_LEVELS]; - unsigned int hoffsets[FS_VERITY_MAX_LEVELS]; + /* The hash blocks that are traversed, indexed by level */ + struct { + /* Page containing the hash block */ + struct page *page; + /* Index of the hash block in the tree overall */ + unsigned long index; + /* Byte offset of the hash block within @page */ + unsigned int offset_in_page; + /* Byte offset of the wanted hash within @page */ + unsigned int hoffset; + } hblocks[FS_VERITY_MAX_LEVELS]; + /* + * The index of the previous level's block within that level; also the + * index of that block's hash within the current level. + */ + u64 hidx = data_pos >> params->log_blocksize; int err; - if (WARN_ON_ONCE(!PageLocked(data_page) || PageUptodate(data_page))) - return false; - - pr_debug_ratelimited("Verifying data page %lu...\n", index); + if (unlikely(data_pos >= inode->i_size)) { + /* + * This can happen in the data page spanning EOF when the Merkle + * tree block size is less than the page size. The Merkle tree + * doesn't cover data blocks fully past EOF. But the entire + * page spanning EOF can be visible to userspace via a mmap, and + * any part past EOF should be all zeroes. Therefore, we need + * to verify that any data blocks fully past EOF are all zeroes. + */ + return data_is_zeroed(inode, data_page, params->block_size, + dblock_offset_in_page); + } /* - * Starting at the leaf level, ascend the tree saving hash pages along - * the way until we find a verified hash page, indicated by PageChecked; - * or until we reach the root. + * Starting at the leaf level, ascend the tree saving hash blocks along + * the way until we find a hash block that has already been verified, or + * until we reach the root. 
*/ for (level = 0; level < params->num_levels; level++) { - pgoff_t hindex; + unsigned long next_hidx; + unsigned long hblock_idx; + pgoff_t hpage_idx; + unsigned int hblock_offset_in_page; unsigned int hoffset; struct page *hpage; - hash_at_level(params, index, level, &hindex, &hoffset); + /* + * The index of the block in the current level; also the index + * of that block's hash within the next level. + */ + next_hidx = hidx >> params->log_arity; + + /* Index of the hash block in the tree overall */ + hblock_idx = params->level_start[level] + next_hidx; + + /* Index of the hash page in the tree overall */ + hpage_idx = hblock_idx >> params->log_blocks_per_page; - pr_debug_ratelimited("Level %d: hindex=%lu, hoffset=%u\n", - level, hindex, hoffset); + /* Byte offset of the hash block within the page */ + hblock_offset_in_page = + (hblock_idx << params->log_blocksize) & ~PAGE_MASK; - hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode, hindex, - level == 0 ? level0_ra_pages : 0); + /* Byte offset of the hash within the page */ + hoffset = hblock_offset_in_page + + ((hidx << params->log_digestsize) & + (params->block_size - 1)); + + hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode, + hpage_idx, level == 0 ? min(max_ra_pages, + params->tree_pages - hpage_idx) : 0); if (IS_ERR(hpage)) { err = PTR_ERR(hpage); fsverity_err(inode, "Error %d reading Merkle tree page %lu", - err, hindex); + err, hpage_idx); goto out; } - - if (PageChecked(hpage)) { + if (is_hash_block_verified(vi, hpage, hblock_idx)) { memcpy_from_page(_want_hash, hpage, hoffset, hsize); want_hash = _want_hash; put_page(hpage); - pr_debug_ratelimited("Hash page already checked, want %s:%*phN\n", - params->hash_alg->name, - hsize, want_hash); goto descend; } - pr_debug_ratelimited("Hash page not yet checked\n"); - hpages[level] = hpage; - hoffsets[level] = hoffset; + hblocks[level].page = hpage; + hblocks[level].index = hblock_idx; + hblocks[level].offset_in_page = hblock_offset_in_page; + hblocks[level].hoffset = hoffset; + hidx = next_hidx; } want_hash = vi->root_hash; - pr_debug("Want root hash: %s:%*phN\n", - params->hash_alg->name, hsize, want_hash); descend: - /* Descend the tree verifying hash pages */ + /* Descend the tree verifying hash blocks. */ for (; level > 0; level--) { - struct page *hpage = hpages[level - 1]; - unsigned int hoffset = hoffsets[level - 1]; - - err = fsverity_hash_page(params, inode, req, hpage, real_hash); + struct page *hpage = hblocks[level - 1].page; + unsigned long hblock_idx = hblocks[level - 1].index; + unsigned int hblock_offset_in_page = + hblocks[level - 1].offset_in_page; + unsigned int hoffset = hblocks[level - 1].hoffset; + + err = fsverity_hash_block(params, inode, req, hpage, + hblock_offset_in_page, real_hash); if (err) goto out; - err = cmp_hashes(vi, want_hash, real_hash, index, level - 1); + err = cmp_hashes(vi, want_hash, real_hash, data_pos, level - 1); if (err) goto out; - SetPageChecked(hpage); + /* + * Mark the hash block as verified. This must be atomic and + * idempotent, as the same hash block might be verified by + * multiple threads concurrently. 
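/*
 * Editor's note (worked example, not from the patch): with SHA-256 and
 * 4K blocks, log_arity = 7 and log_digestsize = 5. For data block 1000
 * (hidx = 1000) at level 0: next_hidx = 1000 >> 7 = 7, so the wanted
 * hash sits in tree block level_start[0] + 7, at byte offset
 * (1000 << 5) & 4095 = 3328 within that block. The same computation in
 * isolation, with a hypothetical helper name:
 */
static void example_hash_location(const struct merkle_tree_params *params,
				  u64 hidx, unsigned long *hblock_idx,
				  unsigned int *hoffset_in_block)
{
	/* Level-0 tree block holding this data block's hash */
	*hblock_idx = params->level_start[0] + (hidx >> params->log_arity);
	/* Byte offset of the hash inside that tree block */
	*hoffset_in_block = (hidx << params->log_digestsize) &
			    (params->block_size - 1);
}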
+ */ + if (vi->hash_block_verified) + set_bit(hblock_idx, vi->hash_block_verified); + else + SetPageChecked(hpage); memcpy_from_page(_want_hash, hpage, hoffset, hsize); want_hash = _want_hash; put_page(hpage); - pr_debug("Verified hash page at level %d, now want %s:%*phN\n", - level - 1, params->hash_alg->name, hsize, want_hash); } - /* Finally, verify the data page */ - err = fsverity_hash_page(params, inode, req, data_page, real_hash); + /* Finally, verify the data block. */ + err = fsverity_hash_block(params, inode, req, data_page, + dblock_offset_in_page, real_hash); if (err) goto out; - err = cmp_hashes(vi, want_hash, real_hash, index, -1); + err = cmp_hashes(vi, want_hash, real_hash, data_pos, -1); out: for (; level > 0; level--) - put_page(hpages[level - 1]); + put_page(hblocks[level - 1].page); return err == 0; } +static bool +verify_data_blocks(struct inode *inode, struct fsverity_info *vi, + struct ahash_request *req, struct folio *data_folio, + size_t len, size_t offset, unsigned long max_ra_pages) +{ + const unsigned int block_size = vi->tree_params.block_size; + u64 pos = (u64)data_folio->index << PAGE_SHIFT; + + if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offset, block_size))) + return false; + if (WARN_ON_ONCE(!folio_test_locked(data_folio) || + folio_test_uptodate(data_folio))) + return false; + do { + struct page *data_page = + folio_page(data_folio, offset >> PAGE_SHIFT); + + if (!verify_data_block(inode, vi, req, data_page, pos + offset, + offset & ~PAGE_MASK, max_ra_pages)) + return false; + offset += block_size; + len -= block_size; + } while (len); + return true; +} + /** - * fsverity_verify_page() - verify a data page - * @page: the page to verity + * fsverity_verify_blocks() - verify data in a folio + * @folio: the folio containing the data to verify + * @len: the length of the data to verify in the folio + * @offset: the offset of the data to verify in the folio * - * Verify a page that has just been read from a verity file. The page must be a - * pagecache page that is still locked and not yet uptodate. + * Verify data that has just been read from a verity file. The data must be + * located in a pagecache folio that is still locked and not yet uptodate. The + * length and offset of the data must be Merkle tree block size aligned. * - * Return: true if the page is valid, else false. + * Return: %true if the data is valid, else %false. */ -bool fsverity_verify_page(struct page *page) +bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset) { - struct inode *inode = page->mapping->host; - const struct fsverity_info *vi = inode->i_verity_info; + struct inode *inode = folio->mapping->host; + struct fsverity_info *vi = inode->i_verity_info; struct ahash_request *req; bool valid; /* This allocation never fails, since it's mempool-backed. */ req = fsverity_alloc_hash_request(vi->tree_params.hash_alg, GFP_NOFS); - valid = verify_page(inode, vi, req, page, 0); + valid = verify_data_blocks(inode, vi, req, folio, len, offset, 0); fsverity_free_hash_request(vi->tree_params.hash_alg, req); return valid; } -EXPORT_SYMBOL_GPL(fsverity_verify_page); +EXPORT_SYMBOL_GPL(fsverity_verify_blocks); #ifdef CONFIG_BLOCK /** * fsverity_verify_bio() - verify a 'read' bio that has just completed * @bio: the bio to verify * - * Verify a set of pages that have just been read from a verity file. The pages - * must be pagecache pages that are still locked and not yet uptodate. If a - * page fails verification, then bio->bi_status is set to an error status. 
+ * Verify the bio's data against the file's Merkle tree. All bio data segments + * must be aligned to the file's Merkle tree block size. If any data fails + * verification, then bio->bi_status is set to an error status. * * This is a helper function for use by the ->readahead() method of filesystems * that issue bios to read data directly into the page cache. Filesystems that @@ -212,15 +338,13 @@ EXPORT_SYMBOL_GPL(fsverity_verify_page); void fsverity_verify_bio(struct bio *bio) { struct inode *inode = bio_first_page_all(bio)->mapping->host; - const struct fsverity_info *vi = inode->i_verity_info; - const struct merkle_tree_params *params = &vi->tree_params; + struct fsverity_info *vi = inode->i_verity_info; struct ahash_request *req; - struct bio_vec *bv; - struct bvec_iter_all iter_all; + struct folio_iter fi; unsigned long max_ra_pages = 0; /* This allocation never fails, since it's mempool-backed. */ - req = fsverity_alloc_hash_request(params->hash_alg, GFP_NOFS); + req = fsverity_alloc_hash_request(vi->tree_params.hash_alg, GFP_NOFS); if (bio->bi_opf & REQ_RAHEAD) { /* @@ -232,24 +356,18 @@ void fsverity_verify_bio(struct bio *bio) * This improves sequential read performance, as it greatly * reduces the number of I/O requests made to the Merkle tree. */ - bio_for_each_segment_all(bv, bio, iter_all) - max_ra_pages++; - max_ra_pages /= 4; + max_ra_pages = bio->bi_iter.bi_size >> (PAGE_SHIFT + 2); } - bio_for_each_segment_all(bv, bio, iter_all) { - struct page *page = bv->bv_page; - unsigned long level0_index = page->index >> params->log_arity; - unsigned long level0_ra_pages = - min(max_ra_pages, params->level0_blocks - level0_index); - - if (!verify_page(inode, vi, req, page, level0_ra_pages)) { + bio_for_each_folio_all(fi, bio) { + if (!verify_data_blocks(inode, vi, req, fi.folio, fi.length, + fi.offset, max_ra_pages)) { bio->bi_status = BLK_STS_IOERR; break; } } - fsverity_free_hash_request(params->hash_alg, req); + fsverity_free_hash_request(vi->tree_params.hash_alg, req); } EXPORT_SYMBOL_GPL(fsverity_verify_bio); #endif /* CONFIG_BLOCK */ diff --git a/fs/xattr.c b/fs/xattr.c index adab9a70b536..14a7eb3c8fa8 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -9,6 +9,7 @@ Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/xattr.h> @@ -82,7 +83,7 @@ xattr_resolve_name(struct inode *inode, const char **name) /** * may_write_xattr - check whether inode allows writing xattr - * @mnt_userns: User namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @inode: the inode on which to set an xattr * * Check whether the inode allows writing xattrs. Specifically, we can never @@ -94,13 +95,13 @@ xattr_resolve_name(struct inode *inode, const char **name) * * Return: On success zero is returned. On error a negative errno is returned. */ -int may_write_xattr(struct user_namespace *mnt_userns, struct inode *inode) +int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode) { if (IS_IMMUTABLE(inode)) return -EPERM; if (IS_APPEND(inode)) return -EPERM; - if (HAS_UNMAPPED_ID(mnt_userns, inode)) + if (HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; return 0; } @@ -110,13 +111,13 @@ int may_write_xattr(struct user_namespace *mnt_userns, struct inode *inode) * because different namespaces have very different rules. 
*/ static int -xattr_permission(struct user_namespace *mnt_userns, struct inode *inode, +xattr_permission(struct mnt_idmap *idmap, struct inode *inode, const char *name, int mask) { if (mask & MAY_WRITE) { int ret; - ret = may_write_xattr(mnt_userns, inode); + ret = may_write_xattr(idmap, inode); if (ret) return ret; } @@ -148,11 +149,11 @@ xattr_permission(struct user_namespace *mnt_userns, struct inode *inode, return (mask & MAY_WRITE) ? -EPERM : -ENODATA; if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && (mask & MAY_WRITE) && - !inode_owner_or_capable(mnt_userns, inode)) + !inode_owner_or_capable(idmap, inode)) return -EPERM; } - return inode_permission(mnt_userns, inode, mask); + return inode_permission(idmap, inode, mask); } /* @@ -183,7 +184,7 @@ xattr_supported_namespace(struct inode *inode, const char *prefix) EXPORT_SYMBOL(xattr_supported_namespace); int -__vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, +__vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) { @@ -199,7 +200,7 @@ __vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, return -EOPNOTSUPP; if (size == 0) value = ""; /* empty EA, do not remove */ - return handler->set(handler, mnt_userns, dentry, inode, name, value, + return handler->set(handler, idmap, dentry, inode, name, value, size, flags); } EXPORT_SYMBOL(__vfs_setxattr); @@ -208,7 +209,7 @@ EXPORT_SYMBOL(__vfs_setxattr); * __vfs_setxattr_noperm - perform setxattr operation without performing * permission checks. * - * @mnt_userns: user namespace of the mount the inode was found from + * @idmap: idmap of the mount the inode was found from * @dentry: object to perform setxattr on * @name: xattr name to set * @value: value to set @name to @@ -221,7 +222,7 @@ EXPORT_SYMBOL(__vfs_setxattr); * is executed. It also assumes that the caller will make the appropriate * permission checks. */ -int __vfs_setxattr_noperm(struct user_namespace *mnt_userns, +int __vfs_setxattr_noperm(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { @@ -233,7 +234,7 @@ int __vfs_setxattr_noperm(struct user_namespace *mnt_userns, if (issec) inode->i_flags &= ~S_NOSEC; if (inode->i_opflags & IOP_XATTR) { - error = __vfs_setxattr(mnt_userns, dentry, inode, name, value, + error = __vfs_setxattr(idmap, dentry, inode, name, value, size, flags); if (!error) { fsnotify_xattr(dentry); @@ -264,7 +265,7 @@ int __vfs_setxattr_noperm(struct user_namespace *mnt_userns, * __vfs_setxattr_locked - set an extended attribute while holding the inode * lock * - * @mnt_userns: user namespace of the mount of the target inode + * @idmap: idmap of the mount of the target inode * @dentry: object to perform setxattr on * @name: xattr name to set * @value: value to set @name to @@ -274,18 +275,18 @@ int __vfs_setxattr_noperm(struct user_namespace *mnt_userns, * a delegation was broken on, NULL if none. 
*/ int -__vfs_setxattr_locked(struct user_namespace *mnt_userns, struct dentry *dentry, +__vfs_setxattr_locked(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; int error; - error = xattr_permission(mnt_userns, inode, name, MAY_WRITE); + error = xattr_permission(idmap, inode, name, MAY_WRITE); if (error) return error; - error = security_inode_setxattr(mnt_userns, dentry, name, value, size, + error = security_inode_setxattr(idmap, dentry, name, value, size, flags); if (error) goto out; @@ -294,7 +295,7 @@ __vfs_setxattr_locked(struct user_namespace *mnt_userns, struct dentry *dentry, if (error) goto out; - error = __vfs_setxattr_noperm(mnt_userns, dentry, name, value, + error = __vfs_setxattr_noperm(idmap, dentry, name, value, size, flags); out: @@ -303,7 +304,7 @@ out: EXPORT_SYMBOL_GPL(__vfs_setxattr_locked); int -vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, +vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; @@ -312,7 +313,7 @@ vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, int error; if (size && strcmp(name, XATTR_NAME_CAPS) == 0) { - error = cap_convert_nscap(mnt_userns, dentry, &value, size); + error = cap_convert_nscap(idmap, dentry, &value, size); if (error < 0) return error; size = error; @@ -320,7 +321,7 @@ vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, retry_deleg: inode_lock(inode); - error = __vfs_setxattr_locked(mnt_userns, dentry, name, value, size, + error = __vfs_setxattr_locked(idmap, dentry, name, value, size, flags, &delegated_inode); inode_unlock(inode); @@ -337,19 +338,19 @@ retry_deleg: EXPORT_SYMBOL_GPL(vfs_setxattr); static ssize_t -xattr_getsecurity(struct user_namespace *mnt_userns, struct inode *inode, +xattr_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void *value, size_t size) { void *buffer = NULL; ssize_t len; if (!value || !size) { - len = security_inode_getsecurity(mnt_userns, inode, name, + len = security_inode_getsecurity(idmap, inode, name, &buffer, false); goto out_noalloc; } - len = security_inode_getsecurity(mnt_userns, inode, name, &buffer, + len = security_inode_getsecurity(idmap, inode, name, &buffer, true); if (len < 0) return len; @@ -374,7 +375,7 @@ out_noalloc: * Returns the result of alloc, if failed, or the getxattr operation. 
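A hedged sketch of the caller-side pattern for the vfs_getxattr_alloc() helper documented here; example_read_label() and its choice of XATTR_NAME_SELINUX are hypothetical, and only the signature shown in the hunk is assumed:

/* Assumes <linux/xattr.h> and <linux/slab.h>. */
static int example_read_label(struct mnt_idmap *idmap, struct dentry *dentry)
{
	char *value = NULL;
	int len;

	/* vfs_getxattr_alloc() sizes and allocates the buffer itself and
	 * returns the attribute length, or a negative errno. */
	len = vfs_getxattr_alloc(idmap, dentry, XATTR_NAME_SELINUX,
				 &value, 0, GFP_KERNEL);
	if (len < 0)
		return len;

	/* ... consume value[0..len) ... */
	kfree(value);
	return 0;
}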
*/ int -vfs_getxattr_alloc(struct user_namespace *mnt_userns, struct dentry *dentry, +vfs_getxattr_alloc(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, char **xattr_value, size_t xattr_size, gfp_t flags) { @@ -383,7 +384,7 @@ vfs_getxattr_alloc(struct user_namespace *mnt_userns, struct dentry *dentry, char *value = *xattr_value; int error; - error = xattr_permission(mnt_userns, inode, name, MAY_READ); + error = xattr_permission(idmap, inode, name, MAY_READ); if (error) return error; @@ -427,13 +428,13 @@ __vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name, EXPORT_SYMBOL(__vfs_getxattr); ssize_t -vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry, +vfs_getxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, void *value, size_t size) { struct inode *inode = dentry->d_inode; int error; - error = xattr_permission(mnt_userns, inode, name, MAY_READ); + error = xattr_permission(idmap, inode, name, MAY_READ); if (error) return error; @@ -444,7 +445,7 @@ vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; - int ret = xattr_getsecurity(mnt_userns, inode, suffix, value, + int ret = xattr_getsecurity(idmap, inode, suffix, value, size); /* * Only overwrite the return value if a security module * is actually active. @@ -480,7 +481,7 @@ vfs_listxattr(struct dentry *dentry, char *list, size_t size) EXPORT_SYMBOL_GPL(vfs_listxattr); int -__vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry, +__vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct inode *inode = d_inode(dentry); @@ -494,7 +495,7 @@ __vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry, return PTR_ERR(handler); if (!handler->set) return -EOPNOTSUPP; - return handler->set(handler, mnt_userns, dentry, inode, name, NULL, 0, + return handler->set(handler, idmap, dentry, inode, name, NULL, 0, XATTR_REPLACE); } EXPORT_SYMBOL(__vfs_removexattr); @@ -503,25 +504,25 @@ EXPORT_SYMBOL(__vfs_removexattr); /** * __vfs_removexattr_locked - remove an extended attribute while holding the inode * lock * - * @mnt_userns: user namespace of the mount of the target inode + * @idmap: idmap of the mount of the target inode * @dentry: object to perform removexattr on * @name: name of xattr to remove * @delegated_inode: on return, will contain an inode pointer that * a delegation was broken on, NULL if none.
*/ int -__vfs_removexattr_locked(struct user_namespace *mnt_userns, +__vfs_removexattr_locked(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; int error; - error = xattr_permission(mnt_userns, inode, name, MAY_WRITE); + error = xattr_permission(idmap, inode, name, MAY_WRITE); if (error) return error; - error = security_inode_removexattr(mnt_userns, dentry, name); + error = security_inode_removexattr(idmap, dentry, name); if (error) goto out; @@ -529,7 +530,7 @@ __vfs_removexattr_locked(struct user_namespace *mnt_userns, if (error) goto out; - error = __vfs_removexattr(mnt_userns, dentry, name); + error = __vfs_removexattr(idmap, dentry, name); if (!error) { fsnotify_xattr(dentry); @@ -542,7 +543,7 @@ out: EXPORT_SYMBOL_GPL(__vfs_removexattr_locked); int -vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry, +vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct inode *inode = dentry->d_inode; @@ -551,7 +552,7 @@ vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry, retry_deleg: inode_lock(inode); - error = __vfs_removexattr_locked(mnt_userns, dentry, + error = __vfs_removexattr_locked(idmap, dentry, name, &delegated_inode); inode_unlock(inode); @@ -605,7 +606,7 @@ int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, return do_set_acl(idmap, dentry, ctx->kname->name, ctx->kvalue, ctx->size); - return vfs_setxattr(mnt_idmap_owner(idmap), dentry, ctx->kname->name, + return vfs_setxattr(idmap, dentry, ctx->kname->name, ctx->kvalue, ctx->size, ctx->flags); } @@ -714,8 +715,7 @@ do_getxattr(struct mnt_idmap *idmap, struct dentry *d, if (is_posix_acl_xattr(ctx->kname->name)) error = do_get_acl(idmap, d, kname, ctx->kvalue, ctx->size); else - error = vfs_getxattr(mnt_idmap_owner(idmap), d, kname, - ctx->kvalue, ctx->size); + error = vfs_getxattr(idmap, d, kname, ctx->kvalue, ctx->size); if (error > 0) { if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error)) error = -EFAULT; @@ -892,9 +892,9 @@ removexattr(struct mnt_idmap *idmap, struct dentry *d, return error; if (is_posix_acl_xattr(kname)) - return vfs_remove_acl(mnt_idmap_owner(idmap), d, kname); + return vfs_remove_acl(idmap, d, kname); - return vfs_removexattr(mnt_idmap_owner(idmap), d, kname); + return vfs_removexattr(idmap, d, kname); } static int path_removexattr(const char __user *pathname, diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index bb0c700afe3c..86696a1c6891 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -44,16 +44,15 @@ xfs_perag_get( xfs_agnumber_t agno) { struct xfs_perag *pag; - int ref = 0; rcu_read_lock(); pag = radix_tree_lookup(&mp->m_perag_tree, agno); if (pag) { + trace_xfs_perag_get(pag, _RET_IP_); ASSERT(atomic_read(&pag->pag_ref) >= 0); - ref = atomic_inc_return(&pag->pag_ref); + atomic_inc(&pag->pag_ref); } rcu_read_unlock(); - trace_xfs_perag_get(mp, agno, ref, _RET_IP_); return pag; } @@ -68,7 +67,6 @@ xfs_perag_get_tag( { struct xfs_perag *pag; int found; - int ref; rcu_read_lock(); found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, @@ -77,9 +75,9 @@ xfs_perag_get_tag( rcu_read_unlock(); return NULL; } - ref = atomic_inc_return(&pag->pag_ref); + trace_xfs_perag_get_tag(pag, _RET_IP_); + atomic_inc(&pag->pag_ref); rcu_read_unlock(); - trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_); return pag; } @@ -87,11 +85,68 @@ void xfs_perag_put( struct xfs_perag *pag) { - int ref; - + 
trace_xfs_perag_put(pag, _RET_IP_); ASSERT(atomic_read(&pag->pag_ref) > 0); - ref = atomic_dec_return(&pag->pag_ref); - trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); + atomic_dec(&pag->pag_ref); +} + +/* + * Active references for perag structures. This is for short term access to the + * per ag structures for walking trees or accessing state. If an AG is being + * shrunk or is offline, then this will fail to find that AG and return NULL + * instead. + */ +struct xfs_perag * +xfs_perag_grab( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + + rcu_read_lock(); + pag = radix_tree_lookup(&mp->m_perag_tree, agno); + if (pag) { + trace_xfs_perag_grab(pag, _RET_IP_); + if (!atomic_inc_not_zero(&pag->pag_active_ref)) + pag = NULL; + } + rcu_read_unlock(); + return pag; +} + +/* + * search from @first to find the next perag with the given tag set. + */ +struct xfs_perag * +xfs_perag_grab_tag( + struct xfs_mount *mp, + xfs_agnumber_t first, + int tag) +{ + struct xfs_perag *pag; + int found; + + rcu_read_lock(); + found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, + (void **)&pag, first, 1, tag); + if (found <= 0) { + rcu_read_unlock(); + return NULL; + } + trace_xfs_perag_grab_tag(pag, _RET_IP_); + if (!atomic_inc_not_zero(&pag->pag_active_ref)) + pag = NULL; + rcu_read_unlock(); + return pag; +} + +void +xfs_perag_rele( + struct xfs_perag *pag) +{ + trace_xfs_perag_rele(pag, _RET_IP_); + if (atomic_dec_and_test(&pag->pag_active_ref)) + wake_up(&pag->pag_active_wq); } /* @@ -196,6 +251,10 @@ xfs_free_perag( cancel_delayed_work_sync(&pag->pag_blockgc_work); xfs_buf_hash_destroy(pag); + /* drop the mount's active reference */ + xfs_perag_rele(pag); + XFS_IS_CORRUPT(pag->pag_mount, + atomic_read(&pag->pag_active_ref) != 0); call_rcu(&pag->rcu_head, __xfs_free_perag); } } @@ -314,6 +373,7 @@ xfs_initialize_perag( INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); init_waitqueue_head(&pag->pagb_wait); + init_waitqueue_head(&pag->pag_active_wq); pag->pagb_count = 0; pag->pagb_tree = RB_ROOT; #endif /* __KERNEL__ */ @@ -322,6 +382,9 @@ xfs_initialize_perag( if (error) goto out_remove_pag; + /* Active ref owned by mount indicates AG is online. */ + atomic_set(&pag->pag_active_ref, 1); + /* first new pag is fully initialized */ if (first_initialised == NULLAGNUMBER) first_initialised = index; @@ -824,7 +887,7 @@ xfs_ag_shrink_space( struct xfs_alloc_arg args = { .tp = *tpp, .mp = mp, - .type = XFS_ALLOCTYPE_THIS_BNO, + .pag = pag, .minlen = delta, .maxlen = delta, .oinfo = XFS_RMAP_OINFO_SKIP_UPDATE, @@ -856,14 +919,11 @@ xfs_ag_shrink_space( if (delta >= aglen) return -EINVAL; - args.fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, aglen - delta); - /* * Make sure that the last inode cluster cannot overlap with the new * end of the AG, even if it's sparse. 
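The split between passive references (get/put, which only pin the structure) and active references (grab/rele, which fail once an AG is offline or shrinking) is the point of this hunk. A minimal sketch of a hypothetical walker, using only the functions added above:

/* example_count_online_ags() is illustrative, not part of the patch. */
static xfs_agnumber_t example_count_online_ags(struct xfs_mount *mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno, online = 0;

	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		/* Returns NULL once pag_active_ref has dropped to zero,
		 * i.e. the AG is offline or being shrunk away. */
		pag = xfs_perag_grab(mp, agno);
		if (!pag)
			continue;
		online++;
		/* Dropping the last active reference wakes pag_active_wq. */
		xfs_perag_rele(pag);
	}
	return online;
}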
*/ - error = xfs_ialloc_check_shrink(*tpp, pag->pag_agno, agibp, - aglen - delta); + error = xfs_ialloc_check_shrink(pag, *tpp, agibp, aglen - delta); if (error) return error; @@ -876,7 +936,8 @@ xfs_ag_shrink_space( return error; /* internal log shouldn't also show up in the free space btrees */ - error = xfs_alloc_vextent(&args); + error = xfs_alloc_vextent_exact_bno(&args, + XFS_AGB_TO_FSB(mp, pag->pag_agno, aglen - delta)); if (!error && args.agbno == NULLAGBLOCK) error = -ENOSPC; diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 191b22b9a35b..5e18536dfdce 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -32,14 +32,12 @@ struct xfs_ag_resv { struct xfs_perag { struct xfs_mount *pag_mount; /* owner filesystem */ xfs_agnumber_t pag_agno; /* AG this structure belongs to */ - atomic_t pag_ref; /* perag reference count */ - char pagf_init; /* this agf's entry is initialized */ - char pagi_init; /* this agi's entry is initialized */ - char pagf_metadata; /* the agf is preferred to be metadata */ - char pagi_inodeok; /* The agi is ok for inodes */ + atomic_t pag_ref; /* passive reference count */ + atomic_t pag_active_ref; /* active reference count */ + wait_queue_head_t pag_active_wq;/* woken when active_ref falls to zero */ + unsigned long pag_opstate; uint8_t pagf_levels[XFS_BTNUM_AGF]; /* # of levels in bno & cnt btree */ - bool pagf_agflreset; /* agfl requires reset before use */ uint32_t pagf_flcount; /* count of blocks in freelist */ xfs_extlen_t pagf_freeblks; /* total free blocks */ xfs_extlen_t pagf_longest; /* longest free space */ @@ -106,16 +104,44 @@ struct xfs_perag { #endif /* __KERNEL__ */ }; +/* + * Per-AG operational state. These are atomic flag bits. + */ +#define XFS_AGSTATE_AGF_INIT 0 +#define XFS_AGSTATE_AGI_INIT 1 +#define XFS_AGSTATE_PREFERS_METADATA 2 +#define XFS_AGSTATE_ALLOWS_INODES 3 +#define XFS_AGSTATE_AGFL_NEEDS_RESET 4 + +#define __XFS_AG_OPSTATE(name, NAME) \ +static inline bool xfs_perag_ ## name (struct xfs_perag *pag) \ +{ \ + return test_bit(XFS_AGSTATE_ ## NAME, &pag->pag_opstate); \ +} + +__XFS_AG_OPSTATE(initialised_agf, AGF_INIT) +__XFS_AG_OPSTATE(initialised_agi, AGI_INIT) +__XFS_AG_OPSTATE(prefers_metadata, PREFERS_METADATA) +__XFS_AG_OPSTATE(allows_inodes, ALLOWS_INODES) +__XFS_AG_OPSTATE(agfl_needs_reset, AGFL_NEEDS_RESET) + int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount, xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi); int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno); void xfs_free_perag(struct xfs_mount *mp); +/* Passive AG references */ struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int tag); void xfs_perag_put(struct xfs_perag *pag); +/* Active AG references */ +struct xfs_perag *xfs_perag_grab(struct xfs_mount *, xfs_agnumber_t); +struct xfs_perag *xfs_perag_grab_tag(struct xfs_mount *, xfs_agnumber_t, + int tag); +void xfs_perag_rele(struct xfs_perag *pag); + /* * Per-ag geometry information and validation */ @@ -193,31 +219,86 @@ xfs_perag_next( struct xfs_mount *mp = pag->pag_mount; *agno = pag->pag_agno + 1; - xfs_perag_put(pag); - if (*agno > end_agno) - return NULL; - return xfs_perag_get(mp, *agno); + xfs_perag_rele(pag); + while (*agno <= end_agno) { + pag = xfs_perag_grab(mp, *agno); + if (pag) + return pag; + (*agno)++; + } + return NULL; } #define for_each_perag_range(mp, agno, end_agno, pag) \ - for ((pag) = xfs_perag_get((mp), (agno)); \ +
for ((pag) = xfs_perag_grab((mp), (agno)); \ (pag) != NULL; \ (pag) = xfs_perag_next((pag), &(agno), (end_agno))) #define for_each_perag_from(mp, agno, pag) \ for_each_perag_range((mp), (agno), (mp)->m_sb.sb_agcount - 1, (pag)) - #define for_each_perag(mp, agno, pag) \ (agno) = 0; \ for_each_perag_from((mp), (agno), (pag)) #define for_each_perag_tag(mp, agno, pag, tag) \ - for ((agno) = 0, (pag) = xfs_perag_get_tag((mp), 0, (tag)); \ + for ((agno) = 0, (pag) = xfs_perag_grab_tag((mp), 0, (tag)); \ (pag) != NULL; \ (agno) = (pag)->pag_agno + 1, \ - xfs_perag_put(pag), \ - (pag) = xfs_perag_get_tag((mp), (agno), (tag))) + xfs_perag_rele(pag), \ + (pag) = xfs_perag_grab_tag((mp), (agno), (tag))) + +static inline struct xfs_perag * +xfs_perag_next_wrap( + struct xfs_perag *pag, + xfs_agnumber_t *agno, + xfs_agnumber_t stop_agno, + xfs_agnumber_t restart_agno, + xfs_agnumber_t wrap_agno) +{ + struct xfs_mount *mp = pag->pag_mount; + + *agno = pag->pag_agno + 1; + xfs_perag_rele(pag); + while (*agno != stop_agno) { + if (*agno >= wrap_agno) { + if (restart_agno >= stop_agno) + break; + *agno = restart_agno; + } + + pag = xfs_perag_grab(mp, *agno); + if (pag) + return pag; + (*agno)++; + } + return NULL; +} + +/* + * Iterate all AGs from start_agno through wrap_agno, then restart_agno through + * (start_agno - 1). + */ +#define for_each_perag_wrap_range(mp, start_agno, restart_agno, wrap_agno, agno, pag) \ + for ((agno) = (start_agno), (pag) = xfs_perag_grab((mp), (agno)); \ + (pag) != NULL; \ + (pag) = xfs_perag_next_wrap((pag), &(agno), (start_agno), \ + (restart_agno), (wrap_agno))) +/* + * Iterate all AGs from start_agno through wrap_agno, then 0 through + * (start_agno - 1). + */ +#define for_each_perag_wrap_at(mp, start_agno, wrap_agno, agno, pag) \ + for_each_perag_wrap_range((mp), (start_agno), 0, (wrap_agno), (agno), (pag)) + +/* + * Iterate all AGs from start_agno through to the end of the filesystem, then 0 + * through (start_agno - 1). + */ +#define for_each_perag_wrap(mp, start_agno, agno, pag) \ + for_each_perag_wrap_at((mp), (start_agno), (mp)->m_sb.sb_agcount, \ + (agno), (pag)) + struct aghdr_init_data { /* per ag data */ diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index 5af123d13a63..7fd1fea95552 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -264,7 +264,7 @@ xfs_ag_resv_init( if (error) goto out; - error = xfs_finobt_calc_reserves(mp, tp, pag, &ask, &used); + error = xfs_finobt_calc_reserves(pag, tp, &ask, &used); if (error) goto out; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 989cf341779b..6a037173d20d 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -36,10 +36,6 @@ struct workqueue_struct *xfs_alloc_wq; #define XFSA_FIXUP_BNO_OK 1 #define XFSA_FIXUP_CNT_OK 2 -STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *); -STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *); -STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); - /* * Size of the AGFL. For CRC-enabled filesystems we steal a couple of slots in * the beginning of the block for a proper header with the location information @@ -772,8 +768,6 @@ xfs_alloc_cur_setup( int error; int i; - ASSERT(args->alignment == 1 || args->type != XFS_ALLOCTYPE_THIS_BNO); - acur->cur_len = args->maxlen; acur->rec_bno = 0; acur->rec_len = 0; @@ -887,7 +881,6 @@ xfs_alloc_cur_check( * We have an aligned record that satisfies minlen and beats or matches * the candidate extent size. Compare locality for near allocation mode.
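The visit order of the wrapping iterators defined above is easy to misread, so here is a small standalone C model of xfs_perag_next_wrap()'s control flow (it ignores grab failures, which in the kernel simply skip unavailable AGs):

#include <stdio.h>

int main(void)
{
	/* Illustrative values: 8 AGs, search seeded at AG 5, allowed to
	 * restart from AG 0 because nothing lower is locked yet. */
	unsigned int start_agno = 5, restart_agno = 0, wrap_agno = 8;
	unsigned int agno = start_agno;

	do {
		printf("%u ", agno);		/* loop body runs here */
		agno++;
		if (agno >= wrap_agno) {
			if (restart_agno >= start_agno)
				break;		/* nothing left to visit */
			agno = restart_agno;	/* wrap below the start */
		}
	} while (agno != start_agno);
	printf("\n");				/* prints: 5 6 7 0 1 2 3 4 */
	return 0;
}

If restart_agno equals start_agno (the transaction already pins the start AG as its floor), the wrap step is skipped and the walk simply stops at the end of the filesystem.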
*/ - ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO); diff = xfs_alloc_compute_diff(args->agbno, args->len, args->alignment, args->datatype, bnoa, lena, &bnew); @@ -1133,78 +1126,6 @@ error: } /* - * Allocate a variable extent in the allocation group agno. - * Type and bno are used to determine where in the allocation group the - * extent will start. - * Extent's length (returned in *len) will be between minlen and maxlen, - * and of the form k * prod + mod unless there's nothing that large. - * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. - */ -STATIC int /* error */ -xfs_alloc_ag_vextent( - xfs_alloc_arg_t *args) /* argument structure for allocation */ -{ - int error=0; - - ASSERT(args->minlen > 0); - ASSERT(args->maxlen > 0); - ASSERT(args->minlen <= args->maxlen); - ASSERT(args->mod < args->prod); - ASSERT(args->alignment > 0); - - /* - * Branch to correct routine based on the type. - */ - args->wasfromfl = 0; - switch (args->type) { - case XFS_ALLOCTYPE_THIS_AG: - error = xfs_alloc_ag_vextent_size(args); - break; - case XFS_ALLOCTYPE_NEAR_BNO: - error = xfs_alloc_ag_vextent_near(args); - break; - case XFS_ALLOCTYPE_THIS_BNO: - error = xfs_alloc_ag_vextent_exact(args); - break; - default: - ASSERT(0); - /* NOTREACHED */ - } - - if (error || args->agbno == NULLAGBLOCK) - return error; - - ASSERT(args->len >= args->minlen); - ASSERT(args->len <= args->maxlen); - ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL); - ASSERT(args->agbno % args->alignment == 0); - - /* if not file data, insert new block into the reverse map btree */ - if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) { - error = xfs_rmap_alloc(args->tp, args->agbp, args->pag, - args->agbno, args->len, &args->oinfo); - if (error) - return error; - } - - if (!args->wasfromfl) { - error = xfs_alloc_update_counters(args->tp, args->agbp, - -((long)(args->len))); - if (error) - return error; - - ASSERT(!xfs_extent_busy_search(args->mp, args->pag, - args->agbno, args->len)); - } - - xfs_ag_resv_alloc_extent(args->pag, args->resv, args); - - XFS_STATS_INC(args->mp, xs_allocx); - XFS_STATS_ADD(args->mp, xs_allocb, args->len); - return error; -} - -/* * Allocate a variable extent at exactly agno/bno. * Extent's length (returned in *len) will be between minlen and maxlen, * and of the form k * prod + mod unless there's nothing that large. 
@@ -1389,7 +1310,6 @@ xfs_alloc_ag_vextent_locality( bool fbinc; ASSERT(acur->len == 0); - ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO); *stat = 0; @@ -2435,7 +2355,7 @@ xfs_agfl_reset( struct xfs_mount *mp = tp->t_mountp; struct xfs_agf *agf = agbp->b_addr; - ASSERT(pag->pagf_agflreset); + ASSERT(xfs_perag_agfl_needs_reset(pag)); trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_); xfs_warn(mp, @@ -2450,7 +2370,7 @@ xfs_agfl_reset( XFS_AGF_FLCOUNT); pag->pagf_flcount = 0; - pag->pagf_agflreset = false; + clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); } /* @@ -2472,20 +2392,20 @@ xfs_defer_agfl_block( struct xfs_owner_info *oinfo) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_extent_free_item *new; /* new element */ + struct xfs_extent_free_item *xefi; ASSERT(xfs_extfree_item_cache != NULL); ASSERT(oinfo != NULL); - new = kmem_cache_zalloc(xfs_extfree_item_cache, + xefi = kmem_cache_zalloc(xfs_extfree_item_cache, GFP_KERNEL | __GFP_NOFAIL); - new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); - new->xefi_blockcount = 1; - new->xefi_owner = oinfo->oi_owner; + xefi->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); + xefi->xefi_blockcount = 1; + xefi->xefi_owner = oinfo->oi_owner; trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list); + xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list); } /* @@ -2500,7 +2420,7 @@ __xfs_free_extent_later( const struct xfs_owner_info *oinfo, bool skip_discard) { - struct xfs_extent_free_item *new; /* new element */ + struct xfs_extent_free_item *xefi; #ifdef DEBUG struct xfs_mount *mp = tp->t_mountp; xfs_agnumber_t agno; @@ -2519,27 +2439,27 @@ __xfs_free_extent_later( #endif ASSERT(xfs_extfree_item_cache != NULL); - new = kmem_cache_zalloc(xfs_extfree_item_cache, + xefi = kmem_cache_zalloc(xfs_extfree_item_cache, GFP_KERNEL | __GFP_NOFAIL); - new->xefi_startblock = bno; - new->xefi_blockcount = (xfs_extlen_t)len; + xefi->xefi_startblock = bno; + xefi->xefi_blockcount = (xfs_extlen_t)len; if (skip_discard) - new->xefi_flags |= XFS_EFI_SKIP_DISCARD; + xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD; if (oinfo) { ASSERT(oinfo->oi_offset == 0); if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK) - new->xefi_flags |= XFS_EFI_ATTR_FORK; + xefi->xefi_flags |= XFS_EFI_ATTR_FORK; if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK) - new->xefi_flags |= XFS_EFI_BMBT_BLOCK; - new->xefi_owner = oinfo->oi_owner; + xefi->xefi_flags |= XFS_EFI_BMBT_BLOCK; + xefi->xefi_owner = oinfo->oi_owner; } else { - new->xefi_owner = XFS_RMAP_OWN_NULL; + xefi->xefi_owner = XFS_RMAP_OWN_NULL; } trace_xfs_bmap_free_defer(tp->t_mountp, XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0, XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list); + xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list); } #ifdef DEBUG @@ -2605,7 +2525,7 @@ xfs_alloc_fix_freelist( /* deferred ops (AGFL block frees) require permanent transactions */ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); - if (!pag->pagf_init) { + if (!xfs_perag_initialised_agf(pag)) { error = xfs_alloc_read_agf(pag, tp, flags, &agbp); if (error) { /* Couldn't lock the AGF so skip this AG. 
*/ @@ -2620,7 +2540,8 @@ xfs_alloc_fix_freelist( * somewhere else if we are not being asked to try harder at this * point */ - if (pag->pagf_metadata && (args->datatype & XFS_ALLOC_USERDATA) && + if (xfs_perag_prefers_metadata(pag) && + (args->datatype & XFS_ALLOC_USERDATA) && (flags & XFS_ALLOC_FLAG_TRYLOCK)) { ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); goto out_agbp_relse; } @@ -2646,7 +2567,7 @@ xfs_alloc_fix_freelist( } /* reset a padding mismatched agfl before final free space check */ - if (pag->pagf_agflreset) + if (xfs_perag_agfl_needs_reset(pag)) xfs_agfl_reset(tp, agbp, pag); /* If there isn't enough total space or single-extent, reject it. */ @@ -2707,7 +2628,6 @@ xfs_alloc_fix_freelist( targs.agbp = agbp; targs.agno = args->agno; targs.alignment = targs.minlen = targs.prod = 1; - targs.type = XFS_ALLOCTYPE_THIS_AG; targs.pag = pag; error = xfs_alloc_read_agfl(pag, tp, &agflbp); if (error) @@ -2720,7 +2640,7 @@ xfs_alloc_fix_freelist( targs.resv = XFS_AG_RESV_AGFL; /* Allocate as many blocks as possible at once. */ - error = xfs_alloc_ag_vextent(&targs); + error = xfs_alloc_ag_vextent_size(&targs); if (error) goto out_agflbp_relse; @@ -2734,6 +2654,18 @@ xfs_alloc_fix_freelist( break; goto out_agflbp_relse; } + + if (!xfs_rmap_should_skip_owner_update(&targs.oinfo)) { + error = xfs_rmap_alloc(tp, agbp, pag, + targs.agbno, targs.len, &targs.oinfo); + if (error) + goto out_agflbp_relse; + } + error = xfs_alloc_update_counters(tp, agbp, + -((long)(targs.len))); + if (error) + goto out_agflbp_relse; + /* * Put each allocated block on the list. */ @@ -2803,7 +2735,7 @@ xfs_alloc_get_freelist( if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp)) agf->agf_flfirst = 0; - ASSERT(!pag->pagf_agflreset); + ASSERT(!xfs_perag_agfl_needs_reset(pag)); be32_add_cpu(&agf->agf_flcount, -1); pag->pagf_flcount--; @@ -2892,7 +2824,7 @@ xfs_alloc_put_freelist( if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp)) agf->agf_fllast = 0; - ASSERT(!pag->pagf_agflreset); + ASSERT(!xfs_perag_agfl_needs_reset(pag)); be32_add_cpu(&agf->agf_flcount, 1); pag->pagf_flcount++; @@ -3099,7 +3031,7 @@ xfs_alloc_read_agf( return error; agf = agfbp->b_addr; - if (!pag->pagf_init) { + if (!xfs_perag_initialised_agf(pag)) { pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); pag->pagf_flcount = be32_to_cpu(agf->agf_flcount); @@ -3111,8 +3043,8 @@ xfs_alloc_read_agf( pag->pagf_levels[XFS_BTNUM_RMAPi] = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); - pag->pagf_init = 1; - pag->pagf_agflreset = xfs_agfl_needs_reset(pag->pag_mount, agf); + if (xfs_agfl_needs_reset(pag->pag_mount, agf)) + set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); /* * Update the in-core allocbt counter. Filter out the rmapbt @@ -3127,6 +3059,8 @@ xfs_alloc_read_agf( if (allocbt_blks > 0) atomic64_add(allocbt_blks, &pag->pag_mount->m_allocbt_blks); + + set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); } #ifdef DEBUG else if (!xfs_is_shutdown(pag->pag_mount)) { @@ -3148,26 +3082,25 @@ xfs_alloc_read_agf( } /* - * Allocate an extent (variable-size). - * Depending on the allocation type, we either look in a single allocation - * group or loop over the allocation groups to find the result. + * Pre-process allocation arguments to set initial state that we don't require + * callers to set up correctly, as well as bounds check the allocation args + * that are set up.
*/ -int /* error */ -xfs_alloc_vextent( - struct xfs_alloc_arg *args) /* allocation argument structure */ +static int +xfs_alloc_vextent_check_args( + struct xfs_alloc_arg *args, + xfs_fsblock_t target, + xfs_agnumber_t *minimum_agno) { - xfs_agblock_t agsize; /* allocation group size */ - int error; - int flags; /* XFS_ALLOC_FLAG_... locking flags */ - struct xfs_mount *mp; /* mount structure pointer */ - xfs_agnumber_t sagno; /* starting allocation group number */ - xfs_alloctype_t type; /* input allocation type */ - int bump_rotor = 0; - xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */ - - mp = args->mp; - type = args->otype = args->type; - args->agbno = NULLAGBLOCK; + struct xfs_mount *mp = args->mp; + xfs_agblock_t agsize; + + args->fsbno = NULLFSBLOCK; + + *minimum_agno = 0; + if (args->tp->t_highest_agno != NULLAGNUMBER) + *minimum_agno = args->tp->t_highest_agno; + /* * Just fix this up, for the case where the last a.g. is shorter * (or there's only one a.g.) and the caller couldn't easily figure @@ -3178,168 +3111,414 @@ xfs_alloc_vextent( args->maxlen = agsize; if (args->alignment == 0) args->alignment = 1; - ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize); + + ASSERT(args->minlen > 0); + ASSERT(args->maxlen > 0); + ASSERT(args->alignment > 0); + ASSERT(args->resv != XFS_AG_RESV_AGFL); + + ASSERT(XFS_FSB_TO_AGNO(mp, target) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, target) < agsize); ASSERT(args->minlen <= args->maxlen); ASSERT(args->minlen <= agsize); ASSERT(args->mod < args->prod); - if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount || - XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize || + + if (XFS_FSB_TO_AGNO(mp, target) >= mp->m_sb.sb_agcount || + XFS_FSB_TO_AGBNO(mp, target) >= agsize || args->minlen > args->maxlen || args->minlen > agsize || args->mod >= args->prod) { - args->fsbno = NULLFSBLOCK; trace_xfs_alloc_vextent_badargs(args); + return -ENOSPC; + } + + if (args->agno != NULLAGNUMBER && *minimum_agno > args->agno) { + trace_xfs_alloc_vextent_skip_deadlock(args); + return -ENOSPC; + } + return 0; + +} + +/* + * Prepare an AG for allocation. If the AG is not prepared to accept the + * allocation, return failure. + * + * XXX(dgc): The complexity of "need_pag" will go away as all caller paths are + * modified to hold their own perag references. + */ +static int +xfs_alloc_vextent_prepare_ag( + struct xfs_alloc_arg *args) +{ + bool need_pag = !args->pag; + int error; + + if (need_pag) + args->pag = xfs_perag_get(args->mp, args->agno); + + args->agbp = NULL; + error = xfs_alloc_fix_freelist(args, 0); + if (error) { + trace_xfs_alloc_vextent_nofix(args); + if (need_pag) + xfs_perag_put(args->pag); + args->agbno = NULLAGBLOCK; + return error; + } + if (!args->agbp) { + /* cannot allocate in this AG at all */ + trace_xfs_alloc_vextent_noagbp(args); + args->agbno = NULLAGBLOCK; return 0; } + args->wasfromfl = 0; + return 0; +} - switch (type) { - case XFS_ALLOCTYPE_THIS_AG: - case XFS_ALLOCTYPE_NEAR_BNO: - case XFS_ALLOCTYPE_THIS_BNO: - /* - * These three force us into a single a.g. 
- */ - args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); - args->pag = xfs_perag_get(mp, args->agno); - error = xfs_alloc_fix_freelist(args, 0); - if (error) { - trace_xfs_alloc_vextent_nofix(args); - goto error0; - } - if (!args->agbp) { - trace_xfs_alloc_vextent_noagbp(args); +/* + * Post-process allocation results to account for the allocation if it succeeded + * and set the allocated block number correctly for the caller. + * + * XXX: we should really be returning ENOSPC for ENOSPC, not + * hiding it behind a "successful" NULLFSBLOCK allocation. + */ +static int +xfs_alloc_vextent_finish( + struct xfs_alloc_arg *args, + xfs_agnumber_t minimum_agno, + int alloc_error, + bool drop_perag) +{ + struct xfs_mount *mp = args->mp; + int error = 0; + + /* + * We can end up here with a locked AGF. If we failed, the caller is + * likely going to try to allocate again with different parameters, and + * that can widen the AGs that are searched for free space. If we have + * to do BMBT block allocation, we have to do a new allocation. + * + * Hence leaving this function with the AGF locked opens up potential + * ABBA AGF deadlocks because a future allocation attempt in this + * transaction may attempt to lock a lower number AGF. + * + * We can't release the AGF until the transaction is committed, so at + * this point we must update the "first allocation" tracker to point at + * this AG if the tracker is empty or points to a lower AG. This allows + * the next allocation attempt to be modified appropriately to avoid + * deadlocks. + */ + if (args->agbp && + (args->tp->t_highest_agno == NULLAGNUMBER || + args->agno > minimum_agno)) + args->tp->t_highest_agno = args->agno; + + /* + * If the allocation failed with an error or we had an ENOSPC result, + * preserve the returned error whilst also marking the allocation result + * as "no extent allocated". This ensures that callers that fail to + * capture the error will still treat it as a failed allocation. + */ + if (alloc_error || args->agbno == NULLAGBLOCK) { + args->fsbno = NULLFSBLOCK; + error = alloc_error; + goto out_drop_perag; + } + + args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno); + + ASSERT(args->len >= args->minlen); + ASSERT(args->len <= args->maxlen); + ASSERT(args->agbno % args->alignment == 0); + XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), args->len); + + /* if not file data, insert new block into the reverse map btree */ + if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) { + error = xfs_rmap_alloc(args->tp, args->agbp, args->pag, + args->agbno, args->len, &args->oinfo); + if (error) + goto out_drop_perag; + } + + if (!args->wasfromfl) { + error = xfs_alloc_update_counters(args->tp, args->agbp, + -((long)(args->len))); + if (error) + goto out_drop_perag; + + ASSERT(!xfs_extent_busy_search(mp, args->pag, args->agbno, + args->len)); + } + + xfs_ag_resv_alloc_extent(args->pag, args->resv, args); + + XFS_STATS_INC(mp, xs_allocx); + XFS_STATS_ADD(mp, xs_allocb, args->len); + +out_drop_perag: + if (drop_perag && args->pag) { + xfs_perag_rele(args->pag); + args->pag = NULL; + } + return error; +} + +/* + * Allocate within a single AG only. This uses a best-fit length algorithm so if + * you need an exact sized allocation without locality constraints, this is the + * fastest way to do it. + * + * Caller is expected to hold a perag reference in args->pag.
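A hedged caller-side sketch of the new single-AG entry point documented above; example_alloc_in_ag() is hypothetical, but every field and constant it uses appears elsewhere in this patch:

static int example_alloc_in_ag(struct xfs_trans *tp, struct xfs_perag *pag,
		xfs_extlen_t len, xfs_fsblock_t *fsbno)
{
	struct xfs_alloc_arg	args = {
		.tp		= tp,
		.mp		= tp->t_mountp,
		.pag		= pag,	/* caller-held perag reference */
		.minlen		= len,
		.maxlen		= len,
		.prod		= 1,
		.alignment	= 1,
		.oinfo		= XFS_RMAP_OINFO_SKIP_UPDATE, /* illustrative */
		.resv		= XFS_AG_RESV_NONE,
	};
	int			error;

	error = xfs_alloc_vextent_this_ag(&args, pag->pag_agno);
	if (error)
		return error;
	/* "Success" with no extent is still reported as NULLFSBLOCK. */
	if (args.fsbno == NULLFSBLOCK)
		return -ENOSPC;
	*fsbno = args.fsbno;
	return 0;
}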
+ */ +int +xfs_alloc_vextent_this_ag( + struct xfs_alloc_arg *args, + xfs_agnumber_t agno) +{ + struct xfs_mount *mp = args->mp; + xfs_agnumber_t minimum_agno; + int error; + + args->agno = agno; + args->agbno = 0; + error = xfs_alloc_vextent_check_args(args, XFS_AGB_TO_FSB(mp, agno, 0), + &minimum_agno); + if (error) { + if (error == -ENOSPC) + return 0; + return error; + } + + error = xfs_alloc_vextent_prepare_ag(args); + if (!error && args->agbp) + error = xfs_alloc_ag_vextent_size(args); + + return xfs_alloc_vextent_finish(args, minimum_agno, error, false); +} + +/* + * Iterate all AGs trying to allocate an extent starting from @start_ag. + * + * If the incoming allocation has a target block (@target_agbno), the + * allocation attempts in @start_agno have locality information. If we fail to + * allocate in that AG, then we revert to anywhere-in-AG for all the other AGs + * we attempt to allocate in as there is no locality optimisation possible for + * those allocations. + * + * On return, args->pag may be left referenced if we finish before the "all + * failed" return point. The allocation finish still needs the perag, and + * so the caller will release it once they've finished the allocation. + * + * When we wrap the AG iteration at the end of the filesystem, we have to be + * careful not to wrap into AGs below ones we already have locked in the + * transaction if we are doing a blocking iteration. This will result in an + * out-of-order locking of AGFs and hence can cause deadlocks. + */ +static int +xfs_alloc_vextent_iterate_ags( + struct xfs_alloc_arg *args, + xfs_agnumber_t minimum_agno, + xfs_agnumber_t start_agno, + xfs_agblock_t target_agbno, + uint32_t flags) +{ + struct xfs_mount *mp = args->mp; + xfs_agnumber_t agno; + int error = 0; + +restart: + for_each_perag_wrap_range(mp, start_agno, minimum_agno, + mp->m_sb.sb_agcount, agno, args->pag) { + args->agno = agno; + error = xfs_alloc_vextent_prepare_ag(args); + if (error) break; + if (!args->agbp) { + trace_xfs_alloc_vextent_loopfailed(args); + continue; } - args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); - if ((error = xfs_alloc_ag_vextent(args))) - goto error0; - break; - case XFS_ALLOCTYPE_START_BNO: - /* - * Try near allocation first, then anywhere-in-ag after - * the first a.g. fails. - */ - if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) && - xfs_is_inode32(mp)) { - args->fsbno = XFS_AGB_TO_FSB(mp, - ((mp->m_agfrotor / rotorstep) % - mp->m_sb.sb_agcount), 0); - bump_rotor = 1; - } - args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); - args->type = XFS_ALLOCTYPE_NEAR_BNO; - fallthrough; - case XFS_ALLOCTYPE_FIRST_AG: + /* - * Rotate through the allocation groups looking for a winner. + * Allocation is supposed to succeed now, so break out of the + * loop regardless of whether we succeed or not. */ - if (type == XFS_ALLOCTYPE_FIRST_AG) { - /* - * Start with allocation group given by bno. - */ - args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); - args->type = XFS_ALLOCTYPE_THIS_AG; - sagno = 0; - flags = 0; + if (args->agno == start_agno && target_agbno) { + args->agbno = target_agbno; + error = xfs_alloc_ag_vextent_near(args); } else { - /* - * Start with the given allocation group. - */ - args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno); - flags = XFS_ALLOC_FLAG_TRYLOCK; - } - /* - * Loop over allocation groups twice; first time with - * trylock set, second time without.
- */ - for (;;) { - args->pag = xfs_perag_get(mp, args->agno); - error = xfs_alloc_fix_freelist(args, flags); - if (error) { - trace_xfs_alloc_vextent_nofix(args); - goto error0; - } - /* - * If we get a buffer back then the allocation will fly. - */ - if (args->agbp) { - if ((error = xfs_alloc_ag_vextent(args))) - goto error0; - break; - } - - trace_xfs_alloc_vextent_loopfailed(args); - - /* - * Didn't work, figure out the next iteration. - */ - if (args->agno == sagno && - type == XFS_ALLOCTYPE_START_BNO) - args->type = XFS_ALLOCTYPE_THIS_AG; - /* - * For the first allocation, we can try any AG to get - * space. However, if we already have allocated a - * block, we don't want to try AGs whose number is below - * sagno. Otherwise, we may end up with out-of-order - * locking of AGF, which might cause deadlock. - */ - if (++(args->agno) == mp->m_sb.sb_agcount) { - if (args->tp->t_firstblock != NULLFSBLOCK) - args->agno = sagno; - else - args->agno = 0; - } - /* - * Reached the starting a.g., must either be done - * or switch to non-trylock mode. - */ - if (args->agno == sagno) { - if (flags == 0) { - args->agbno = NULLAGBLOCK; - trace_xfs_alloc_vextent_allfailed(args); - break; - } - - flags = 0; - if (type == XFS_ALLOCTYPE_START_BNO) { - args->agbno = XFS_FSB_TO_AGBNO(mp, - args->fsbno); - args->type = XFS_ALLOCTYPE_NEAR_BNO; - } - } - xfs_perag_put(args->pag); - } - if (bump_rotor) { - if (args->agno == sagno) - mp->m_agfrotor = (mp->m_agfrotor + 1) % - (mp->m_sb.sb_agcount * rotorstep); - else - mp->m_agfrotor = (args->agno * rotorstep + 1) % - (mp->m_sb.sb_agcount * rotorstep); + args->agbno = 0; + error = xfs_alloc_ag_vextent_size(args); } break; - default: - ASSERT(0); - /* NOTREACHED */ } - if (args->agbno == NULLAGBLOCK) - args->fsbno = NULLFSBLOCK; - else { - args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno); -#ifdef DEBUG - ASSERT(args->len >= args->minlen); - ASSERT(args->len <= args->maxlen); - ASSERT(args->agbno % args->alignment == 0); - XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), - args->len); -#endif + if (error) { + xfs_perag_rele(args->pag); + args->pag = NULL; + return error; + } + if (args->agbp) + return 0; + /* + * We didn't find an AG we can allocate from. If we were given + * constraining flags by the caller, drop them and retry the allocation + * without any constraints being set. + */ + if (flags) { + flags = 0; + goto restart; } - xfs_perag_put(args->pag); + + ASSERT(args->pag == NULL); + trace_xfs_alloc_vextent_allfailed(args); return 0; -error0: - xfs_perag_put(args->pag); - return error; +} + +/* + * Iterate the AGs from the start AG to the end of the filesystem, trying + * to allocate blocks. It starts with a near allocation attempt in the initial + * AG, then falls back to anywhere-in-ag after the first AG fails. It will wrap + * back to zero if allowed by previous allocations in this transaction, + * otherwise will wrap back to the start AG and run a second blocking pass to + * the end of the filesystem.
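The caller-side shape of this conversion, condensed from the xfs_bmap hunks later in the patch; a before/after sketch with surrounding code elided:

	/* Before: policy carried in args.type, target passed in args.fsbno. */
	args.type = XFS_ALLOCTYPE_START_BNO;
	args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
	error = xfs_alloc_vextent(&args);

	/* After: the function named selects the policy, the target is an
	 * explicit parameter, and args.fsbno is output-only. */
	error = xfs_alloc_vextent_start_ag(&args,
			XFS_INO_TO_FSB(mp, ip->i_ino));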
+ */ +int +xfs_alloc_vextent_start_ag( + struct xfs_alloc_arg *args, + xfs_fsblock_t target) +{ + struct xfs_mount *mp = args->mp; + xfs_agnumber_t minimum_agno; + xfs_agnumber_t start_agno; + xfs_agnumber_t rotorstep = xfs_rotorstep; + bool bump_rotor = false; + int error; + + args->agno = NULLAGNUMBER; + args->agbno = NULLAGBLOCK; + error = xfs_alloc_vextent_check_args(args, target, &minimum_agno); + if (error) { + if (error == -ENOSPC) + return 0; + return error; + } + + if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) && + xfs_is_inode32(mp)) { + target = XFS_AGB_TO_FSB(mp, + ((mp->m_agfrotor / rotorstep) % + mp->m_sb.sb_agcount), 0); + bump_rotor = 1; + } + + start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target)); + error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno, + XFS_FSB_TO_AGBNO(mp, target), XFS_ALLOC_FLAG_TRYLOCK); + + if (bump_rotor) { + if (args->agno == start_agno) + mp->m_agfrotor = (mp->m_agfrotor + 1) % + (mp->m_sb.sb_agcount * rotorstep); + else + mp->m_agfrotor = (args->agno * rotorstep + 1) % + (mp->m_sb.sb_agcount * rotorstep); + } + + return xfs_alloc_vextent_finish(args, minimum_agno, error, true); +} + +/* + * Iterate from the agno indicated via @target through to the end of the + * filesystem attempting blocking allocation. This does not wrap or try a second + * pass, so will not recurse into AGs lower than indicated by the target. + */ +int +xfs_alloc_vextent_first_ag( + struct xfs_alloc_arg *args, + xfs_fsblock_t target) + { + struct xfs_mount *mp = args->mp; + xfs_agnumber_t minimum_agno; + xfs_agnumber_t start_agno; + int error; + + args->agno = NULLAGNUMBER; + args->agbno = NULLAGBLOCK; + error = xfs_alloc_vextent_check_args(args, target, &minimum_agno); + if (error) { + if (error == -ENOSPC) + return 0; + return error; + } + + start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target)); + error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno, + XFS_FSB_TO_AGBNO(mp, target), 0); + return xfs_alloc_vextent_finish(args, minimum_agno, error, true); +} + +/* + * Allocate at the exact block target or fail. Caller is expected to hold a + * perag reference in args->pag. + */ +int +xfs_alloc_vextent_exact_bno( + struct xfs_alloc_arg *args, + xfs_fsblock_t target) +{ + struct xfs_mount *mp = args->mp; + xfs_agnumber_t minimum_agno; + int error; + + args->agno = XFS_FSB_TO_AGNO(mp, target); + args->agbno = XFS_FSB_TO_AGBNO(mp, target); + error = xfs_alloc_vextent_check_args(args, target, &minimum_agno); + if (error) { + if (error == -ENOSPC) + return 0; + return error; + } + + error = xfs_alloc_vextent_prepare_ag(args); + if (!error && args->agbp) + error = xfs_alloc_ag_vextent_exact(args); + + return xfs_alloc_vextent_finish(args, minimum_agno, error, false); +} + +/* + * Allocate an extent as close to the target as possible. If there are not + * viable candidates in the AG, then fail the allocation. + * + * Caller may or may not have a per-ag reference in args->pag. 
+ */ +int +xfs_alloc_vextent_near_bno( + struct xfs_alloc_arg *args, + xfs_fsblock_t target) +{ + struct xfs_mount *mp = args->mp; + xfs_agnumber_t minimum_agno; + bool needs_perag = args->pag == NULL; + int error; + + args->agno = XFS_FSB_TO_AGNO(mp, target); + args->agbno = XFS_FSB_TO_AGBNO(mp, target); + error = xfs_alloc_vextent_check_args(args, target, &minimum_agno); + if (error) { + if (error == -ENOSPC) + return 0; + return error; + } + + if (needs_perag) + args->pag = xfs_perag_grab(mp, args->agno); + + error = xfs_alloc_vextent_prepare_ag(args); + if (!error && args->agbp) + error = xfs_alloc_ag_vextent_near(args); + + return xfs_alloc_vextent_finish(args, minimum_agno, error, needs_perag); } /* Ensure that the freelist is at full capacity. */ diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 2c3f762dfb58..2b246d74c189 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -17,25 +17,6 @@ extern struct workqueue_struct *xfs_alloc_wq; unsigned int xfs_agfl_size(struct xfs_mount *mp); /* - * Freespace allocation types. Argument to xfs_alloc_[v]extent. - */ -#define XFS_ALLOCTYPE_FIRST_AG 0x02 /* ... start at ag 0 */ -#define XFS_ALLOCTYPE_THIS_AG 0x08 /* anywhere in this a.g. */ -#define XFS_ALLOCTYPE_START_BNO 0x10 /* near this block else anywhere */ -#define XFS_ALLOCTYPE_NEAR_BNO 0x20 /* in this a.g. and near this block */ -#define XFS_ALLOCTYPE_THIS_BNO 0x40 /* at exactly this block */ - -/* this should become an enum again when the tracing code is fixed */ -typedef unsigned int xfs_alloctype_t; - -#define XFS_ALLOC_TYPES \ - { XFS_ALLOCTYPE_FIRST_AG, "FIRST_AG" }, \ - { XFS_ALLOCTYPE_THIS_AG, "THIS_AG" }, \ - { XFS_ALLOCTYPE_START_BNO, "START_BNO" }, \ - { XFS_ALLOCTYPE_NEAR_BNO, "NEAR_BNO" }, \ - { XFS_ALLOCTYPE_THIS_BNO, "THIS_BNO" } - -/* * Flags for xfs_alloc_fix_freelist. */ #define XFS_ALLOC_FLAG_TRYLOCK 0x00000001 /* use trylock for buffer locking */ @@ -68,8 +49,6 @@ typedef struct xfs_alloc_arg { xfs_agblock_t min_agbno; /* set an agbno range for NEAR allocs */ xfs_agblock_t max_agbno; /* ... */ xfs_extlen_t len; /* output: actual size of extent */ - xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */ - xfs_alloctype_t otype; /* original allocation type */ int datatype; /* mask defining data type treatment */ char wasdel; /* set if allocation was prev delayed */ char wasfromfl; /* set if allocation is from freelist */ @@ -118,11 +97,43 @@ xfs_alloc_log_agf( uint32_t fields);/* mask of fields to be logged (XFS_AGF_...) */ /* - * Allocate an extent (variable-size). + * Allocate an extent anywhere in the specific AG given. If there is no + * space matching the requirements in that AG, then the allocation will fail. */ -int /* error */ -xfs_alloc_vextent( - xfs_alloc_arg_t *args); /* allocation argument structure */ +int xfs_alloc_vextent_this_ag(struct xfs_alloc_arg *args, xfs_agnumber_t agno); + +/* + * Allocate an extent as close to the target as possible. If there are not + * viable candidates in the AG, then fail the allocation. + */ +int xfs_alloc_vextent_near_bno(struct xfs_alloc_arg *args, + xfs_fsblock_t target); + +/* + * Allocate an extent exactly at the target given. If this is not possible + * then the allocation fails. + */ +int xfs_alloc_vextent_exact_bno(struct xfs_alloc_arg *args, + xfs_fsblock_t target); + +/* + * Best effort full filesystem allocation scan. + * + * Locality aware allocation will be attempted in the initial AG, but on failure + * non-localised attempts will be made. 
The AGs are constrained by previous + * allocations in the current transaction. Two passes will be made - the first + * non-blocking, the second blocking. + */ +int xfs_alloc_vextent_start_ag(struct xfs_alloc_arg *args, + xfs_fsblock_t target); + +/* + * Iterate from the AG indicated by @target through to the end of the + * filesystem attempting blocking allocation. This is for use in last + * resort allocation attempts when everything else has failed. + */ +int xfs_alloc_vextent_first_ag(struct xfs_alloc_arg *args, + xfs_fsblock_t target); /* * Free an extent. diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 549a3cba0234..0f29c7b1b39f 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -315,7 +315,7 @@ xfs_allocbt_verify( level = be16_to_cpu(block->bb_level); if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC)) btnum = XFS_BTNUM_CNTi; - if (pag && pag->pagf_init) { + if (pag && xfs_perag_initialised_agf(pag)) { if (level >= pag->pagf_levels[btnum]) return __this_address; } else if (level >= mp->m_alloc_maxlevels) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 0d56a8d862e8..34de6e6898c4 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -645,34 +645,23 @@ xfs_bmap_extents_to_btree( args.tp = tp; args.mp = mp; xfs_rmap_ino_bmbt_owner(&args.oinfo, ip->i_ino, whichfork); - if (tp->t_firstblock == NULLFSBLOCK) { - args.type = XFS_ALLOCTYPE_START_BNO; - args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); - } else if (tp->t_flags & XFS_TRANS_LOWMODE) { - args.type = XFS_ALLOCTYPE_START_BNO; - args.fsbno = tp->t_firstblock; - } else { - args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.fsbno = tp->t_firstblock; - } + args.minlen = args.maxlen = args.prod = 1; args.wasdel = wasdel; *logflagsp = 0; - error = xfs_alloc_vextent(&args); + error = xfs_alloc_vextent_start_ag(&args, + XFS_INO_TO_FSB(mp, ip->i_ino)); if (error) goto out_root_realloc; + /* + * Allocation can't fail, the space was reserved. + */ if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { error = -ENOSPC; goto out_root_realloc; } - /* - * Allocation can't fail, the space was reserved. - */ - ASSERT(tp->t_firstblock == NULLFSBLOCK || - args.agno >= XFS_FSB_TO_AGNO(mp, tp->t_firstblock)); - tp->t_firstblock = args.fsbno; cur->bc_ino.allocated++; ip->i_nblocks++; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); @@ -799,28 +788,24 @@ xfs_bmap_local_to_extents( memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = ip->i_mount; + args.total = total; + args.minlen = args.maxlen = args.prod = 1; xfs_rmap_ino_owner(&args.oinfo, ip->i_ino, whichfork, 0); + /* * Allocate a block. We know we need only one, since the * file currently fits in an inode. */ - if (tp->t_firstblock == NULLFSBLOCK) { - args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino); - args.type = XFS_ALLOCTYPE_START_BNO; - } else { - args.fsbno = tp->t_firstblock; - args.type = XFS_ALLOCTYPE_NEAR_BNO; - } args.total = total; args.minlen = args.maxlen = args.prod = 1; - error = xfs_alloc_vextent(&args); + error = xfs_alloc_vextent_start_ag(&args, + XFS_INO_TO_FSB(args.mp, ip->i_ino)); if (error) goto done; /* Can't fail, the space was reserved.
*/ ASSERT(args.fsbno != NULLFSBLOCK); ASSERT(args.len == 1); - tp->t_firstblock = args.fsbno; error = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, XFS_FSB_TO_DADDR(args.mp, args.fsbno), args.mp->m_bsize, 0, &bp); @@ -854,8 +839,7 @@ xfs_bmap_local_to_extents( ifp->if_nextents = 1; ip->i_nblocks = 1; - xfs_trans_mod_dquot_byino(tp, ip, - XFS_TRANS_DQ_BCOUNT, 1L); + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); flags |= xfs_ilog_fext(whichfork); done: @@ -3025,9 +3009,7 @@ xfs_bmap_adjacent( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { xfs_fsblock_t adjust; /* adjustment to block numbers */ - xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ xfs_mount_t *mp; /* mount point structure */ - int nullfb; /* true if ap->firstblock isn't set */ int rt; /* true if inode is realtime */ #define ISVALID(x,y) \ @@ -3038,11 +3020,8 @@ xfs_bmap_adjacent( XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks) mp = ap->ip->i_mount; - nullfb = ap->tp->t_firstblock == NULLFSBLOCK; rt = XFS_IS_REALTIME_INODE(ap->ip) && (ap->datatype & XFS_ALLOC_USERDATA); - fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, - ap->tp->t_firstblock); /* * If allocating at eof, and there's a previous real block, * try to use its last block as our starting point. @@ -3101,13 +3080,6 @@ xfs_bmap_adjacent( prevbno += adjust; else prevdiff += adjust; - /* - * If the firstblock forbids it, can't use it, - * must use default. - */ - if (!rt && !nullfb && - XFS_FSB_TO_AGNO(mp, prevbno) != fb_agno) - prevbno = NULLFSBLOCK; } /* * No previous block or can't follow it, just default. @@ -3143,13 +3115,6 @@ xfs_bmap_adjacent( gotdiff += adjust - ap->length; } else gotdiff += adjust; - /* - * If the firstblock forbids it, can't use it, - * must use default. - */ - if (!rt && !nullfb && - XFS_FSB_TO_AGNO(mp, gotbno) != fb_agno) - gotbno = NULLFSBLOCK; } /* * No next block, just default. @@ -3170,147 +3135,91 @@ xfs_bmap_adjacent( #undef ISVALID } -static int +int xfs_bmap_longest_free_extent( + struct xfs_perag *pag, struct xfs_trans *tp, - xfs_agnumber_t ag, - xfs_extlen_t *blen, - int *notinit) + xfs_extlen_t *blen) { - struct xfs_mount *mp = tp->t_mountp; - struct xfs_perag *pag; xfs_extlen_t longest; int error = 0; - pag = xfs_perag_get(mp, ag); - if (!pag->pagf_init) { + if (!xfs_perag_initialised_agf(pag)) { error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_TRYLOCK, NULL); - if (error) { - /* Couldn't lock the AGF, so skip this AG. */ - if (error == -EAGAIN) { - *notinit = 1; - error = 0; - } - goto out; - } + if (error) + return error; } longest = xfs_alloc_longest_free_extent(pag, - xfs_alloc_min_freelist(mp, pag), + xfs_alloc_min_freelist(pag->pag_mount, pag), xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE)); if (*blen < longest) *blen = longest; -out: - xfs_perag_put(pag); - return error; + return 0; } -static void +static xfs_extlen_t xfs_bmap_select_minlen( struct xfs_bmalloca *ap, struct xfs_alloc_arg *args, - xfs_extlen_t *blen, - int notinit) + xfs_extlen_t blen) { - if (notinit || *blen < ap->minlen) { - /* - * Since we did a BUF_TRYLOCK above, it is possible that - * there is space for this request. - */ - args->minlen = ap->minlen; - } else if (*blen < args->maxlen) { - /* - * If the best seen length is less than the request length, - * use the best as the minimum. - */ - args->minlen = *blen; - } else { - /* - * Otherwise we've seen an extent as big as maxlen, use that - * as the minimum. 
- */ - args->minlen = args->maxlen; - } -} - -STATIC int -xfs_bmap_btalloc_nullfb( - struct xfs_bmalloca *ap, - struct xfs_alloc_arg *args, - xfs_extlen_t *blen) -{ - struct xfs_mount *mp = ap->ip->i_mount; - xfs_agnumber_t ag, startag; - int notinit = 0; - int error; - - args->type = XFS_ALLOCTYPE_START_BNO; - args->total = ap->total; - startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno); - if (startag == NULLAGNUMBER) - startag = ag = 0; - - while (*blen < args->maxlen) { - error = xfs_bmap_longest_free_extent(args->tp, ag, blen, - ¬init); - if (error) - return error; - - if (++ag == mp->m_sb.sb_agcount) - ag = 0; - if (ag == startag) - break; - } + /* + * Since we used XFS_ALLOC_FLAG_TRYLOCK in _longest_free_extent(), it is + * possible that there is enough contiguous free space for this request. + */ + if (blen < ap->minlen) + return ap->minlen; - xfs_bmap_select_minlen(ap, args, blen, notinit); - return 0; + /* + * If the best seen length is less than the request length, + * use the best as the minimum, otherwise we've got the maxlen we + * were asked for. + */ + if (blen < args->maxlen) + return blen; + return args->maxlen; } -STATIC int -xfs_bmap_btalloc_filestreams( +static int +xfs_bmap_btalloc_select_lengths( struct xfs_bmalloca *ap, struct xfs_alloc_arg *args, xfs_extlen_t *blen) { - struct xfs_mount *mp = ap->ip->i_mount; - xfs_agnumber_t ag; - int notinit = 0; - int error; - - args->type = XFS_ALLOCTYPE_NEAR_BNO; - args->total = ap->total; - - ag = XFS_FSB_TO_AGNO(mp, args->fsbno); - if (ag == NULLAGNUMBER) - ag = 0; - - error = xfs_bmap_longest_free_extent(args->tp, ag, blen, ¬init); - if (error) - return error; + struct xfs_mount *mp = args->mp; + struct xfs_perag *pag; + xfs_agnumber_t agno, startag; + int error = 0; - if (*blen < args->maxlen) { - error = xfs_filestream_new_ag(ap, &ag); - if (error) - return error; + if (ap->tp->t_flags & XFS_TRANS_LOWMODE) { + args->total = ap->minlen; + args->minlen = ap->minlen; + return 0; + } - error = xfs_bmap_longest_free_extent(args->tp, ag, blen, - ¬init); - if (error) - return error; + args->total = ap->total; + startag = XFS_FSB_TO_AGNO(mp, ap->blkno); + if (startag == NULLAGNUMBER) + startag = 0; + *blen = 0; + for_each_perag_wrap(mp, startag, agno, pag) { + error = xfs_bmap_longest_free_extent(pag, args->tp, blen); + if (error && error != -EAGAIN) + break; + error = 0; + if (*blen >= args->maxlen) + break; } + if (pag) + xfs_perag_rele(pag); - xfs_bmap_select_minlen(ap, args, blen, notinit); - - /* - * Set the failure fallback case to look in the selected AG as stream - * may have moved. - */ - ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); - return 0; + args->minlen = xfs_bmap_select_minlen(ap, args, *blen); + return error; } /* Update all inode and quota accounting for the allocation we just did. */ @@ -3413,21 +3322,7 @@ xfs_bmap_process_allocated_extent( xfs_fileoff_t orig_offset, xfs_extlen_t orig_length) { - int nullfb; - - nullfb = ap->tp->t_firstblock == NULLFSBLOCK; - - /* - * check the allocation happened at the same or higher AG than - * the first block that was allocated. 
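The length-selection loop above leans on the for_each_perag_wrap() iterator contract: each pass holds a reference on the current AG, and breaking out of the body exits with that reference still held. A minimal skeleton of the pattern, with a hypothetical predicate standing in for the free-extent check:

	/*
	 * Wrapping AG walk: start at start_agno, wrap past the last AG back
	 * to 0, and stop after a full circuit. On an early break the current
	 * pag is still referenced, hence the mandatory trailing release.
	 */
	for_each_perag_wrap(mp, start_agno, agno, pag) {
		if (ag_is_acceptable(pag))	/* hypothetical predicate */
			break;
	}
	if (pag)
		xfs_perag_rele(pag);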
- */ - ASSERT(nullfb || - XFS_FSB_TO_AGNO(args->mp, ap->tp->t_firstblock) <= - XFS_FSB_TO_AGNO(args->mp, args->fsbno)); - ap->blkno = args->fsbno; - if (nullfb) - ap->tp->t_firstblock = args->fsbno; ap->length = args->len; /* * If the extent size hint is active, we tried to round the @@ -3474,23 +3369,17 @@ xfs_bmap_exact_minlen_extent_alloc( xfs_bmap_compute_alignments(ap, &args); - if (ap->tp->t_firstblock == NULLFSBLOCK) { - /* - * Unlike the longest extent available in an AG, we don't track - * the length of an AG's shortest extent. - * XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT is a debug only knob and - * hence we can afford to start traversing from the 0th AG since - * we need not be concerned about a drop in performance in - * "debug only" code paths. - */ - ap->blkno = XFS_AGB_TO_FSB(mp, 0, 0); - } else { - ap->blkno = ap->tp->t_firstblock; - } + /* + * Unlike the longest extent available in an AG, we don't track + * the length of an AG's shortest extent. + * XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT is a debug only knob and + * hence we can afford to start traversing from the 0th AG since + * we need not be concerned about a drop in performance in + * "debug only" code paths. + */ + ap->blkno = XFS_AGB_TO_FSB(mp, 0, 0); - args.fsbno = ap->blkno; args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; - args.type = XFS_ALLOCTYPE_FIRST_AG; args.minlen = args.maxlen = ap->minlen; args.total = ap->total; @@ -3502,7 +3391,7 @@ xfs_bmap_exact_minlen_extent_alloc( args.resv = XFS_AG_RESV_NONE; args.datatype = ap->datatype; - error = xfs_alloc_vextent(&args); + error = xfs_alloc_vextent_first_ag(&args, ap->blkno); if (error) return error; @@ -3522,193 +3411,270 @@ xfs_bmap_exact_minlen_extent_alloc( #endif -STATIC int -xfs_bmap_btalloc( - struct xfs_bmalloca *ap) +/* + * If we are not low on available data blocks and we are allocating at + * EOF, optimise allocation for contiguous file extension and/or stripe + * alignment of the new extent. + * + * NOTE: ap->aeof is only set if the allocation length is >= the + * stripe unit and the allocation offset is at the end of file. + */ +static int +xfs_bmap_btalloc_at_eof( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_extlen_t blen, + int stripe_align, + bool ag_only) { - struct xfs_mount *mp = ap->ip->i_mount; - struct xfs_alloc_arg args = { .tp = ap->tp, .mp = mp }; - xfs_alloctype_t atype = 0; - xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ - xfs_agnumber_t ag; - xfs_fileoff_t orig_offset; - xfs_extlen_t orig_length; - xfs_extlen_t blen; - xfs_extlen_t nextminlen = 0; - int nullfb; /* true if ap->firstblock isn't set */ - int isaligned; - int tryagain; + struct xfs_mount *mp = args->mp; + struct xfs_perag *caller_pag = args->pag; int error; - int stripe_align; - - ASSERT(ap->length); - orig_offset = ap->offset; - orig_length = ap->length; - - stripe_align = xfs_bmap_compute_alignments(ap, &args); - - nullfb = ap->tp->t_firstblock == NULLFSBLOCK; - fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, - ap->tp->t_firstblock); - if (nullfb) { - if ((ap->datatype & XFS_ALLOC_USERDATA) && - xfs_inode_is_filestream(ap->ip)) { - ag = xfs_filestream_lookup_ag(ap->ip); - ag = (ag != NULLAGNUMBER) ? ag : 0; - ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0); - } else { - ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino); - } - } else - ap->blkno = ap->tp->t_firstblock; - - xfs_bmap_adjacent(ap); /* - * If allowed, use ap->blkno; otherwise must use firstblock since - * it's in the right allocation group. 
- */ - if (nullfb || XFS_FSB_TO_AGNO(mp, ap->blkno) == fb_agno) - ; - else - ap->blkno = ap->tp->t_firstblock; - /* - * Normal allocation, done through xfs_alloc_vextent. + * If there are already extents in the file, try an exact EOF block + * allocation to extend the file as a contiguous extent. If that fails, + * or it's the first allocation in a file, just try for a stripe aligned + * allocation. */ - tryagain = isaligned = 0; - args.fsbno = ap->blkno; - args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; + if (ap->offset) { + xfs_extlen_t nextminlen = 0; - /* Trim the allocation back to the maximum an AG can fit. */ - args.maxlen = min(ap->length, mp->m_ag_max_usable); - blen = 0; - if (nullfb) { /* - * Search for an allocation group with a single extent large - * enough for the request. If one isn't found, then adjust - * the minimum allocation size to the largest space found. + * Compute the minlen+alignment for the next case. Set slop so + * that the value of minlen+alignment+slop doesn't go up between + * the calls. */ - if ((ap->datatype & XFS_ALLOC_USERDATA) && - xfs_inode_is_filestream(ap->ip)) - error = xfs_bmap_btalloc_filestreams(ap, &args, &blen); + args->alignment = 1; + if (blen > stripe_align && blen <= args->maxlen) + nextminlen = blen - stripe_align; else - error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); + nextminlen = args->minlen; + if (nextminlen + stripe_align > args->minlen + 1) + args->minalignslop = nextminlen + stripe_align - + args->minlen - 1; + else + args->minalignslop = 0; + + if (!caller_pag) + args->pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ap->blkno)); + error = xfs_alloc_vextent_exact_bno(args, ap->blkno); + if (!caller_pag) + xfs_perag_put(args->pag); if (error) return error; - } else if (ap->tp->t_flags & XFS_TRANS_LOWMODE) { - if (xfs_inode_is_filestream(ap->ip)) - args.type = XFS_ALLOCTYPE_FIRST_AG; - else - args.type = XFS_ALLOCTYPE_START_BNO; - args.total = args.minlen = ap->minlen; + + if (args->fsbno != NULLFSBLOCK) + return 0; + /* + * Exact allocation failed. Reset to try an aligned allocation + * according to the original allocation specification. + */ + args->pag = NULL; + args->alignment = stripe_align; + args->minlen = nextminlen; + args->minalignslop = 0; } else { - args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.total = ap->total; - args.minlen = ap->minlen; + /* + * Adjust minlen to try and preserve alignment if we + * can't guarantee an aligned maxlen extent. + */ + args->alignment = stripe_align; + if (blen > args->alignment && + blen <= args->maxlen + args->alignment) + args->minlen = blen - args->alignment; + args->minalignslop = 0; } - /* - * If we are not low on available data blocks, and the underlying - * logical volume manager is a stripe, and the file offset is zero then - * try to allocate data blocks on stripe unit boundary. NOTE: ap->aeof - * is only set if the allocation length is >= the stripe unit and the - * allocation offset is at the end of file. - */ - if (!(ap->tp->t_flags & XFS_TRANS_LOWMODE) && ap->aeof) { - if (!ap->offset) { - args.alignment = stripe_align; - atype = args.type; - isaligned = 1; - /* - * Adjust minlen to try and preserve alignment if we - * can't guarantee an aligned maxlen extent. - */ - if (blen > args.alignment && - blen <= args.maxlen + args.alignment) - args.minlen = blen - args.alignment; - args.minalignslop = 0; - } else { - /* - * First try an exact bno allocation. - * If it fails then do a near or start bno - * allocation with alignment turned on. 
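The slop computation above is dense, so a worked example helps; the numbers are assumed, not from the patch. With stripe_align = 16, args->minlen = 1, blen = 40 and args->maxlen = 64:

	/*
	 * nextminlen = blen - stripe_align = 40 - 16 = 24
	 * nextminlen + stripe_align = 40 > args->minlen + 1 = 2, so
	 * minalignslop = 24 + 16 - 1 - 1 = 38
	 *
	 * The exact-bno attempt thus checks for minlen + slop = 1 + 38 = 39
	 * blocks, while the aligned retry needs at most minlen + alignment
	 * - 1 = 24 + 16 - 1 = 39 blocks, so minlen + alignment + slop never
	 * grows between the two calls, exactly as the comment requires.
	 */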
- */ - atype = args.type; - tryagain = 1; - args.type = XFS_ALLOCTYPE_THIS_BNO; - args.alignment = 1; - /* - * Compute the minlen+alignment for the - * next case. Set slop so that the value - * of minlen+alignment+slop doesn't go up - * between the calls. - */ - if (blen > stripe_align && blen <= args.maxlen) - nextminlen = blen - stripe_align; - else - nextminlen = args.minlen; - if (nextminlen + stripe_align > args.minlen + 1) - args.minalignslop = - nextminlen + stripe_align - - args.minlen - 1; - else - args.minalignslop = 0; - } + if (ag_only) { + error = xfs_alloc_vextent_near_bno(args, ap->blkno); } else { - args.alignment = 1; - args.minalignslop = 0; + args->pag = NULL; + error = xfs_alloc_vextent_start_ag(args, ap->blkno); + ASSERT(args->pag == NULL); + args->pag = caller_pag; } - args.minleft = ap->minleft; - args.wasdel = ap->wasdel; - args.resv = XFS_AG_RESV_NONE; - args.datatype = ap->datatype; - - error = xfs_alloc_vextent(&args); if (error) return error; - if (tryagain && args.fsbno == NULLFSBLOCK) { - /* - * Exact allocation failed. Now try with alignment - * turned on. - */ - args.type = atype; - args.fsbno = ap->blkno; - args.alignment = stripe_align; - args.minlen = nextminlen; - args.minalignslop = 0; - isaligned = 1; - if ((error = xfs_alloc_vextent(&args))) - return error; - } - if (isaligned && args.fsbno == NULLFSBLOCK) { - /* - * allocation failed, so turn off alignment and - * try again. - */ - args.type = atype; - args.fsbno = ap->blkno; - args.alignment = 0; - if ((error = xfs_alloc_vextent(&args))) + return error; + + if (args->fsbno != NULLFSBLOCK) + return 0; + + /* + * Allocation failed, so return the allocation args to their + * original non-aligned state so the caller can proceed on allocation + * failure as if this function was never called. + */ + args->fsbno = ap->blkno; + args->alignment = 1; + return 0; +} + +/* + * We have failed multiple allocation attempts so we are now in a low space + * allocation situation. Try a locality first full filesystem minimum length + * allocation whilst still maintaining necessary total block reservation + * requirements. + * + * If that fails, we are now critically low on space, so perform a last resort + * allocation attempt: no reserve, no locality, blocking, minimum length, full + * filesystem free space scan. We also indicate to future allocations in this + * transaction that we are critically low on space so they don't waste time on + * allocation modes that are unlikely to succeed. + */ +int +xfs_bmap_btalloc_low_space( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args) +{ + int error; + + if (args->minlen > ap->minlen) { + args->minlen = ap->minlen; + error = xfs_alloc_vextent_start_ag(args, ap->blkno); + if (error || args->fsbno != NULLFSBLOCK) + return error; } + + /* Last ditch attempt before failure is declared.
 */ + args->total = ap->minlen; + error = xfs_alloc_vextent_first_ag(args, 0); + if (error) + return error; + ap->tp->t_flags |= XFS_TRANS_LOWMODE; + return 0; +} + +static int +xfs_bmap_btalloc_filestreams( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + int stripe_align) +{ + xfs_extlen_t blen = 0; + int error = 0; + + + error = xfs_filestream_select_ag(ap, args, &blen); + if (error) + return error; + ASSERT(args->pag); + + /* + * If we are in low space mode, then optimal allocation will fail so + * prepare for minimal allocation and jump to the low space algorithm + * immediately. + */ + if (ap->tp->t_flags & XFS_TRANS_LOWMODE) { + args->minlen = ap->minlen; + ASSERT(args->fsbno == NULLFSBLOCK); + goto out_low_space; } - if (args.fsbno == NULLFSBLOCK && nullfb) { - args.fsbno = 0; - args.type = XFS_ALLOCTYPE_FIRST_AG; - args.total = ap->minlen; - if ((error = xfs_alloc_vextent(&args))) + + args->minlen = xfs_bmap_select_minlen(ap, args, blen); + if (ap->aeof) + error = xfs_bmap_btalloc_at_eof(ap, args, blen, stripe_align, + true); + + if (!error && args->fsbno == NULLFSBLOCK) + error = xfs_alloc_vextent_near_bno(args, ap->blkno); + +out_low_space: + /* + * We are now done with the perag reference for the filestreams + * association provided by xfs_filestream_select_ag(). Release it now as + * we've either succeeded, had a fatal error, or we are out of space and + * need to do a full filesystem scan for free space which will take its + * own references. + */ + xfs_perag_rele(args->pag); + args->pag = NULL; + if (error || args->fsbno != NULLFSBLOCK) + return error; + + return xfs_bmap_btalloc_low_space(ap, args); +} + +static int +xfs_bmap_btalloc_best_length( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + int stripe_align) +{ + xfs_extlen_t blen = 0; + int error; + + ap->blkno = XFS_INO_TO_FSB(args->mp, ap->ip->i_ino); + xfs_bmap_adjacent(ap); + + /* + * Search for an allocation group with a single extent large enough for + * the request. If one isn't found, then adjust the minimum allocation + * size to the largest space found. + */ + error = xfs_bmap_btalloc_select_lengths(ap, args, &blen); + if (error) + return error; + + /* + * Don't attempt optimal EOF allocation if previous allocations barely + * succeeded due to being near ENOSPC. It is highly unlikely we'll get + * optimal or even aligned allocations in this case, so don't waste time + * trying. + */ + if (ap->aeof && !(ap->tp->t_flags & XFS_TRANS_LOWMODE)) { + error = xfs_bmap_btalloc_at_eof(ap, args, blen, stripe_align, + false); + if (error || args->fsbno != NULLFSBLOCK) return error; - ap->tp->t_flags |= XFS_TRANS_LOWMODE; } + error = xfs_alloc_vextent_start_ag(args, ap->blkno); + if (error || args->fsbno != NULLFSBLOCK) + return error; + + return xfs_bmap_btalloc_low_space(ap, args); +} + +static int +xfs_bmap_btalloc( + struct xfs_bmalloca *ap) +{ + struct xfs_mount *mp = ap->ip->i_mount; + struct xfs_alloc_arg args = { + .tp = ap->tp, + .mp = mp, + .fsbno = NULLFSBLOCK, + .oinfo = XFS_RMAP_OINFO_SKIP_UPDATE, + .minleft = ap->minleft, + .wasdel = ap->wasdel, + .resv = XFS_AG_RESV_NONE, + .datatype = ap->datatype, + .alignment = 1, + .minalignslop = 0, + }; + xfs_fileoff_t orig_offset; + xfs_extlen_t orig_length; + int error; + int stripe_align; + + ASSERT(ap->length); + orig_offset = ap->offset; + orig_length = ap->length; + + stripe_align = xfs_bmap_compute_alignments(ap, &args); + + /* Trim the allocation back to the maximum an AG can fit.
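Taken together, the new helpers arrange the data allocation attempts into a fixed fallback chain. A condensed map of the non-filestreams path, using only the function names introduced above:

	/*
	 * xfs_bmap_btalloc_best_length()
	 *   -> xfs_bmap_btalloc_at_eof()          exact bno, then aligned
	 *   -> xfs_alloc_vextent_start_ag()       locality-first, all AGs
	 *   -> xfs_bmap_btalloc_low_space()
	 *        -> xfs_alloc_vextent_start_ag()  retried at ap->minlen
	 *        -> xfs_alloc_vextent_first_ag()  last resort; sets
	 *                                         XFS_TRANS_LOWMODE
	 */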
*/ + args.maxlen = min(ap->length, mp->m_ag_max_usable); + + if ((ap->datatype & XFS_ALLOC_USERDATA) && + xfs_inode_is_filestream(ap->ip)) + error = xfs_bmap_btalloc_filestreams(ap, &args, stripe_align); + else + error = xfs_bmap_btalloc_best_length(ap, &args, stripe_align); + if (error) + return error; + if (args.fsbno != NULLFSBLOCK) { xfs_bmap_process_allocated_extent(ap, &args, orig_offset, orig_length); @@ -4256,7 +4222,7 @@ xfs_bmapi_convert_unwritten( return 0; } -static inline xfs_extlen_t +xfs_extlen_t xfs_bmapi_minleft( struct xfs_trans *tp, struct xfs_inode *ip, @@ -4264,7 +4230,7 @@ xfs_bmapi_minleft( { struct xfs_ifork *ifp = xfs_ifork_ptr(ip, fork); - if (tp && tp->t_firstblock != NULLFSBLOCK) + if (tp && tp->t_highest_agno != NULLAGNUMBER) return 0; if (ifp->if_format != XFS_DINODE_FMT_BTREE) return 1; @@ -6146,39 +6112,37 @@ xfs_bmap_unmap_extent( int xfs_bmap_finish_one( struct xfs_trans *tp, - struct xfs_inode *ip, - enum xfs_bmap_intent_type type, - int whichfork, - xfs_fileoff_t startoff, - xfs_fsblock_t startblock, - xfs_filblks_t *blockcount, - xfs_exntst_t state) + struct xfs_bmap_intent *bi) { + struct xfs_bmbt_irec *bmap = &bi->bi_bmap; int error = 0; - ASSERT(tp->t_firstblock == NULLFSBLOCK); + ASSERT(tp->t_highest_agno == NULLAGNUMBER); trace_xfs_bmap_deferred(tp->t_mountp, - XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type, - XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), - ip->i_ino, whichfork, startoff, *blockcount, state); + XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock), + bi->bi_type, + XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock), + bi->bi_owner->i_ino, bi->bi_whichfork, + bmap->br_startoff, bmap->br_blockcount, + bmap->br_state); - if (WARN_ON_ONCE(whichfork != XFS_DATA_FORK)) + if (WARN_ON_ONCE(bi->bi_whichfork != XFS_DATA_FORK)) return -EFSCORRUPTED; if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE)) return -EIO; - switch (type) { + switch (bi->bi_type) { case XFS_BMAP_MAP: - error = xfs_bmapi_remap(tp, ip, startoff, *blockcount, - startblock, 0); - *blockcount = 0; + error = xfs_bmapi_remap(tp, bi->bi_owner, bmap->br_startoff, + bmap->br_blockcount, bmap->br_startblock, 0); + bmap->br_blockcount = 0; break; case XFS_BMAP_UNMAP: - error = __xfs_bunmapi(tp, ip, startoff, blockcount, - XFS_BMAPI_REMAP, 1); + error = __xfs_bunmapi(tp, bi->bi_owner, bmap->br_startoff, + &bmap->br_blockcount, XFS_BMAPI_REMAP, 1); break; default: ASSERT(0); diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 16db95b11589..dd08361ca5a6 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -12,6 +12,7 @@ struct xfs_ifork; struct xfs_inode; struct xfs_mount; struct xfs_trans; +struct xfs_alloc_arg; /* * Argument structure for xfs_bmap_alloc. 
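With xfs_bmap_finish_one() taking the intent directly, a caller now populates one structure instead of passing seven scalars. A hedged sketch of the call; the field values are illustrative, and irec stands for a previously filled struct xfs_bmbt_irec:

	/* Illustrative construction of the consolidated intent argument. */
	struct xfs_bmap_intent	bi = {
		.bi_type	= XFS_BMAP_MAP,
		.bi_owner	= ip,		/* inode owning the mapping */
		.bi_whichfork	= XFS_DATA_FORK,
		.bi_bmap	= irec,		/* extent to map */
	};

	error = xfs_bmap_finish_one(tp, &bi);
	/* On success of the MAP case, bi.bi_bmap.br_blockcount is zeroed. */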
@@ -168,6 +169,8 @@ static inline bool xfs_bmap_is_written_extent(struct xfs_bmbt_irec *irec) #define xfs_valid_startblock(ip, startblock) \ ((startblock) != 0 || XFS_IS_REALTIME_INODE(ip)) +int xfs_bmap_longest_free_extent(struct xfs_perag *pag, + struct xfs_trans *tp, xfs_extlen_t *blen); void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); unsigned int xfs_bmap_compute_attr_offset(struct xfs_mount *mp); @@ -220,6 +223,10 @@ int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp, struct xfs_bmbt_irec *new, int *logflagsp); +xfs_extlen_t xfs_bmapi_minleft(struct xfs_trans *tp, struct xfs_inode *ip, + int fork); +int xfs_bmap_btalloc_low_space(struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args); enum xfs_bmap_intent_type { XFS_BMAP_MAP = 1, @@ -234,10 +241,7 @@ struct xfs_bmap_intent { struct xfs_bmbt_irec bi_bmap; }; -int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_inode *ip, - enum xfs_bmap_intent_type type, int whichfork, - xfs_fileoff_t startoff, xfs_fsblock_t startblock, - xfs_filblks_t *blockcount, xfs_exntst_t state); +int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_bmap_intent *bi); void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_bmbt_irec *imap); void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index cfa052d40105..b8ad95050c9b 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -21,6 +21,7 @@ #include "xfs_quota.h" #include "xfs_trace.h" #include "xfs_rmap.h" +#include "xfs_ag.h" static struct kmem_cache *xfs_bmbt_cur_cache; @@ -184,11 +185,11 @@ xfs_bmbt_update_cursor( struct xfs_btree_cur *src, struct xfs_btree_cur *dst) { - ASSERT((dst->bc_tp->t_firstblock != NULLFSBLOCK) || + ASSERT((dst->bc_tp->t_highest_agno != NULLAGNUMBER) || (dst->bc_ino.ip->i_diflags & XFS_DIFLAG_REALTIME)); dst->bc_ino.allocated += src->bc_ino.allocated; - dst->bc_tp->t_firstblock = src->bc_tp->t_firstblock; + dst->bc_tp->t_highest_agno = src->bc_tp->t_highest_agno; src->bc_ino.allocated = 0; } @@ -200,46 +201,32 @@ xfs_bmbt_alloc_block( union xfs_btree_ptr *new, int *stat) { - xfs_alloc_arg_t args; /* block allocation args */ - int error; /* error return value */ + struct xfs_alloc_arg args; + int error; memset(&args, 0, sizeof(args)); args.tp = cur->bc_tp; args.mp = cur->bc_mp; - args.fsbno = cur->bc_tp->t_firstblock; xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_ino.ip->i_ino, cur->bc_ino.whichfork); - - if (args.fsbno == NULLFSBLOCK) { - args.fsbno = be64_to_cpu(start->l); - args.type = XFS_ALLOCTYPE_START_BNO; - /* - * Make sure there is sufficient room left in the AG to - * complete a full tree split for an extent insert. If - * we are converting the middle part of an extent then - * we may need space for two tree splits. - * - * We are relying on the caller to make the correct block - * reservation for this operation to succeed. If the - * reservation amount is insufficient then we may fail a - * block allocation here and corrupt the filesystem. 
- */ - args.minleft = args.tp->t_blk_res; - } else if (cur->bc_tp->t_flags & XFS_TRANS_LOWMODE) { - args.type = XFS_ALLOCTYPE_START_BNO; - } else { - args.type = XFS_ALLOCTYPE_NEAR_BNO; - } - args.minlen = args.maxlen = args.prod = 1; args.wasdel = cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL; - if (!args.wasdel && args.tp->t_blk_res == 0) { - error = -ENOSPC; - goto error0; - } - error = xfs_alloc_vextent(&args); + if (!args.wasdel && args.tp->t_blk_res == 0) + return -ENOSPC; + + /* + * If we are coming here from something like unwritten extent + * conversion, no data extent allocation has been done yet, so + * we have to ensure that we attempt to locate the entire set of bmbt + * allocations in the same AG, as xfs_bmapi_write() would have reserved. + */ + if (cur->bc_tp->t_highest_agno == NULLAGNUMBER) + args.minleft = xfs_bmapi_minleft(cur->bc_tp, cur->bc_ino.ip, + cur->bc_ino.whichfork); + + error = xfs_alloc_vextent_start_ag(&args, be64_to_cpu(start->l)); if (error) - goto error0; + return error; if (args.fsbno == NULLFSBLOCK && args.minleft) { /* @@ -247,11 +234,10 @@ xfs_bmbt_alloc_block( * a full btree split. Try again and if * successful activate the lowspace algorithm. */ - args.fsbno = 0; - args.type = XFS_ALLOCTYPE_FIRST_AG; - error = xfs_alloc_vextent(&args); + args.minleft = 0; + error = xfs_alloc_vextent_start_ag(&args, 0); if (error) - goto error0; + return error; cur->bc_tp->t_flags |= XFS_TRANS_LOWMODE; } if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { @@ -260,7 +246,6 @@ xfs_bmbt_alloc_block( } ASSERT(args.len == 1); - cur->bc_tp->t_firstblock = args.fsbno; cur->bc_ino.allocated++; cur->bc_ino.ip->i_nblocks++; xfs_trans_log_inode(args.tp, cur->bc_ino.ip, XFS_ILOG_CORE); @@ -271,9 +256,6 @@ xfs_bmbt_alloc_block( *stat = 1; return 0; - - error0: - return error; } STATIC int diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 35f574421670..c4649cc624e1 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -2913,9 +2913,22 @@ xfs_btree_split_worker( } /* - * BMBT split requests often come in with little stack to work on. Push + * BMBT split requests often come in with little stack to work on, so we push * them off to a worker thread so there is lots of stack to use. For the other * btree types, just call directly to avoid the context switch overhead here. + * + * Care must be taken here - the work queue rescuer thread introduces potential + * AGF <> worker queue deadlocks if the BMBT block allocation has to lock new + * AGFs to allocate blocks. A task being run by the rescuer could attempt to + * lock an AGF that is already locked by a task queued to run by the rescuer, + * resulting in an ABBA deadlock as the rescuer cannot run the lock holder to + * release it until the work it is currently running gains the lock. + * + * To avoid this issue, we only ever queue BMBT splits that don't have an AGF + * already locked to allocate from. The only place that doesn't hold an AGF + * locked is unwritten extent conversion at IO completion, but that has already + * been offloaded to a worker thread and hence has no stack consumption issues + * we have to worry about.
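The ABBA scenario described above is easier to see as a timeline; the tasks T1/T2 and the AGF numbers below are hypothetical:

	/*
	 * T1: holds AGF(1); its BMBT split work W1 is being run by the
	 *     workqueue rescuer and now needs to lock AGF(2).
	 * T2: holds AGF(2) and is blocked waiting for its own split work
	 *     W2, which is queued behind W1 on the same rescuer thread.
	 *
	 * W1 waits for AGF(2), T2 waits for W2, W2 waits for the rescuer,
	 * and the rescuer waits for W1 to finish: deadlock. Hence the gate
	 * below only offloads a split when the transaction holds no AGF at
	 * all (t_highest_agno == NULLAGNUMBER).
	 */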
*/ STATIC int /* error */ xfs_btree_split( @@ -2929,7 +2942,8 @@ xfs_btree_split( struct xfs_btree_split_args args; DECLARE_COMPLETION_ONSTACK(done); - if (cur->bc_btnum != XFS_BTNUM_BMAP) + if (cur->bc_btnum != XFS_BTNUM_BMAP || + cur->bc_tp->t_highest_agno == NULLAGNUMBER) return __xfs_btree_split(cur, level, ptrp, key, curp, stat); args.cur = cur; diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 5118dedf9267..7ee292aecbeb 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -169,10 +169,9 @@ xfs_inobt_insert_rec( */ STATIC int xfs_inobt_insert( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, - struct xfs_perag *pag, xfs_agino_t newino, xfs_agino_t newlen, xfs_btnum_t btnum) @@ -182,7 +181,7 @@ xfs_inobt_insert( int i; int error; - cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, btnum); + cur = xfs_inobt_init_cursor(pag, tp, agbp, btnum); for (thisino = newino; thisino < newino + newlen; @@ -514,20 +513,20 @@ __xfs_inobt_rec_merge( */ STATIC int xfs_inobt_insert_sprec( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, - struct xfs_perag *pag, int btnum, struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */ bool merge) /* merge or replace */ { + struct xfs_mount *mp = pag->pag_mount; struct xfs_btree_cur *cur; int error; int i; struct xfs_inobt_rec_incore rec; - cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, btnum); + cur = xfs_inobt_init_cursor(pag, tp, agbp, btnum); /* the new record is pre-aligned so we know where to look */ error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i); @@ -609,9 +608,9 @@ error: */ STATIC int xfs_ialloc_ag_alloc( + struct xfs_perag *pag, struct xfs_trans *tp, - struct xfs_buf *agbp, - struct xfs_perag *pag) + struct xfs_buf *agbp) { struct xfs_agi *agi; struct xfs_alloc_arg args; @@ -631,6 +630,7 @@ xfs_ialloc_ag_alloc( args.mp = tp->t_mountp; args.fsbno = NULLFSBLOCK; args.oinfo = XFS_RMAP_OINFO_INODES; + args.pag = pag; #ifdef DEBUG /* randomly do sparse inode allocations */ @@ -662,8 +662,6 @@ xfs_ialloc_ag_alloc( goto sparse_alloc; if (likely(newino != NULLAGINO && (args.agbno < be32_to_cpu(agi->agi_length)))) { - args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno); - args.type = XFS_ALLOCTYPE_THIS_BNO; args.prod = 1; /* @@ -684,7 +682,10 @@ xfs_ialloc_ag_alloc( /* Allow space for the inode btree to split. */ args.minleft = igeo->inobt_maxlevels; - if ((error = xfs_alloc_vextent(&args))) + error = xfs_alloc_vextent_exact_bno(&args, + XFS_AGB_TO_FSB(args.mp, pag->pag_agno, + args.agbno)); + if (error) return error; /* @@ -717,22 +718,17 @@ xfs_ialloc_ag_alloc( } else args.alignment = igeo->cluster_align; /* - * Need to figure out where to allocate the inode blocks. - * Ideally they should be spaced out through the a.g. - * For now, just allocate blocks up front. - */ - args.agbno = be32_to_cpu(agi->agi_root); - args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno); - /* * Allocate a fixed-size extent of inodes. */ - args.type = XFS_ALLOCTYPE_NEAR_BNO; args.prod = 1; /* * Allow space for the inode btree to split. */ args.minleft = igeo->inobt_maxlevels; - if ((error = xfs_alloc_vextent(&args))) + error = xfs_alloc_vextent_near_bno(&args, + XFS_AGB_TO_FSB(args.mp, pag->pag_agno, + be32_to_cpu(agi->agi_root))); + if (error) return error; } @@ -741,11 +737,11 @@ xfs_ialloc_ag_alloc( * alignment. 
*/ if (isaligned && args.fsbno == NULLFSBLOCK) { - args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.agbno = be32_to_cpu(agi->agi_root); - args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno); args.alignment = igeo->cluster_align; - if ((error = xfs_alloc_vextent(&args))) + error = xfs_alloc_vextent_near_bno(&args, + XFS_AGB_TO_FSB(args.mp, pag->pag_agno, + be32_to_cpu(agi->agi_root))); + if (error) return error; } @@ -757,9 +753,6 @@ xfs_ialloc_ag_alloc( igeo->ialloc_min_blks < igeo->ialloc_blks && args.fsbno == NULLFSBLOCK) { sparse_alloc: - args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.agbno = be32_to_cpu(agi->agi_root); - args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno); args.alignment = args.mp->m_sb.sb_spino_align; args.prod = 1; @@ -781,7 +774,9 @@ sparse_alloc: args.mp->m_sb.sb_inoalignmt) - igeo->ialloc_blks; - error = xfs_alloc_vextent(&args); + error = xfs_alloc_vextent_near_bno(&args, + XFS_AGB_TO_FSB(args.mp, pag->pag_agno, + be32_to_cpu(agi->agi_root))); if (error) return error; @@ -831,7 +826,7 @@ sparse_alloc: * if necessary. If a merge does occur, rec is updated to the * merged record. */ - error = xfs_inobt_insert_sprec(args.mp, tp, agbp, pag, + error = xfs_inobt_insert_sprec(pag, tp, agbp, XFS_BTNUM_INO, &rec, true); if (error == -EFSCORRUPTED) { xfs_alert(args.mp, @@ -856,20 +851,20 @@ sparse_alloc: * existing record with this one. */ if (xfs_has_finobt(args.mp)) { - error = xfs_inobt_insert_sprec(args.mp, tp, agbp, pag, + error = xfs_inobt_insert_sprec(pag, tp, agbp, XFS_BTNUM_FINO, &rec, false); if (error) return error; } } else { /* full chunk - insert new records to both btrees */ - error = xfs_inobt_insert(args.mp, tp, agbp, pag, newino, newlen, + error = xfs_inobt_insert(pag, tp, agbp, newino, newlen, XFS_BTNUM_INO); if (error) return error; if (xfs_has_finobt(args.mp)) { - error = xfs_inobt_insert(args.mp, tp, agbp, pag, newino, + error = xfs_inobt_insert(pag, tp, agbp, newino, newlen, XFS_BTNUM_FINO); if (error) return error; @@ -981,9 +976,9 @@ xfs_inobt_first_free_inode( */ STATIC int xfs_dialloc_ag_inobt( + struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, - struct xfs_perag *pag, xfs_ino_t parent, xfs_ino_t *inop) { @@ -999,12 +994,12 @@ xfs_dialloc_ag_inobt( int i, j; int searchdistance = 10; - ASSERT(pag->pagi_init); - ASSERT(pag->pagi_inodeok); + ASSERT(xfs_perag_initialised_agi(pag)); + ASSERT(xfs_perag_allows_inodes(pag)); ASSERT(pag->pagi_freecount > 0); restart_pagno: - cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO); /* * If pagino is 0 (this is the root inode allocation) use newino. * This must work because we've just allocated some. @@ -1429,9 +1424,9 @@ xfs_dialloc_ag_update_inobt( */ static int xfs_dialloc_ag( + struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, - struct xfs_perag *pag, xfs_ino_t parent, xfs_ino_t *inop) { @@ -1448,7 +1443,7 @@ xfs_dialloc_ag( int i; if (!xfs_has_finobt(mp)) - return xfs_dialloc_ag_inobt(tp, agbp, pag, parent, inop); + return xfs_dialloc_ag_inobt(pag, tp, agbp, parent, inop); /* * If pagino is 0 (this is the root inode allocation) use newino. @@ -1457,7 +1452,7 @@ xfs_dialloc_ag( if (!pagino) pagino = be32_to_cpu(agi->agi_newino); - cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_FINO); + cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_FINO); error = xfs_check_agi_freecount(cur); if (error) @@ -1500,7 +1495,7 @@ xfs_dialloc_ag( * the original freecount. 
If all is well, make the equivalent update to * the inobt using the finobt record and offset information. */ - icur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO); + icur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO); error = xfs_check_agi_freecount(icur); if (error) @@ -1577,25 +1572,10 @@ xfs_dialloc_roll( return error; } -static xfs_agnumber_t -xfs_ialloc_next_ag( - xfs_mount_t *mp) -{ - xfs_agnumber_t agno; - - spin_lock(&mp->m_agirotor_lock); - agno = mp->m_agirotor; - if (++mp->m_agirotor >= mp->m_maxagi) - mp->m_agirotor = 0; - spin_unlock(&mp->m_agirotor_lock); - - return agno; -} - static bool xfs_dialloc_good_ag( - struct xfs_trans *tp, struct xfs_perag *pag, + struct xfs_trans *tp, umode_t mode, int flags, bool ok_alloc) @@ -1606,10 +1586,12 @@ xfs_dialloc_good_ag( int needspace; int error; - if (!pag->pagi_inodeok) + if (!pag) + return false; + if (!xfs_perag_allows_inodes(pag)) return false; - if (!pag->pagi_init) { + if (!xfs_perag_initialised_agi(pag)) { error = xfs_ialloc_read_agi(pag, tp, NULL); if (error) return false; @@ -1620,7 +1602,7 @@ xfs_dialloc_good_ag( if (!ok_alloc) return false; - if (!pag->pagf_init) { + if (!xfs_perag_initialised_agf(pag)) { error = xfs_alloc_read_agf(pag, tp, flags, NULL); if (error) return false; @@ -1665,8 +1647,8 @@ xfs_dialloc_good_ag( static int xfs_dialloc_try_ag( - struct xfs_trans **tpp, struct xfs_perag *pag, + struct xfs_trans **tpp, xfs_ino_t parent, xfs_ino_t *new_ino, bool ok_alloc) @@ -1689,7 +1671,7 @@ xfs_dialloc_try_ag( goto out_release; } - error = xfs_ialloc_ag_alloc(*tpp, agbp, pag); + error = xfs_ialloc_ag_alloc(pag, *tpp, agbp); if (error < 0) goto out_release; @@ -1705,7 +1687,7 @@ xfs_dialloc_try_ag( } /* Allocate an inode in the found AG */ - error = xfs_dialloc_ag(*tpp, agbp, pag, parent, &ino); + error = xfs_dialloc_ag(pag, *tpp, agbp, parent, &ino); if (!error) *new_ino = ino; return error; @@ -1737,8 +1719,9 @@ xfs_dialloc( struct xfs_perag *pag; struct xfs_ino_geometry *igeo = M_IGEO(mp); bool ok_alloc = true; + bool low_space = false; int flags; - xfs_ino_t ino; + xfs_ino_t ino = NULLFSINO; /* * Directories, symlinks, and regular files frequently allocate at least @@ -1746,7 +1729,8 @@ xfs_dialloc( * an AG has enough space for file creation. */ if (S_ISDIR(mode)) - start_agno = xfs_ialloc_next_ag(mp); + start_agno = (atomic_inc_return(&mp->m_agirotor) - 1) % + mp->m_maxagi; else { start_agno = XFS_INO_TO_AGNO(mp, parent); if (start_agno >= mp->m_maxagi) @@ -1768,41 +1752,55 @@ xfs_dialloc( } /* + * If we are near to ENOSPC, we want to prefer allocation from AGs that + * have free inodes in them rather than use up free space allocating new + * inode chunks. Hence we turn off allocation for the first non-blocking + * pass through the AGs if we are near ENOSPC to consume free inodes + * that we can immediately allocate, but then we allow allocation on the + * second pass if we fail to find an AG with free inodes in it. + */ + if (percpu_counter_read_positive(&mp->m_fdblocks) < + mp->m_low_space[XFS_LOWSP_1_PCNT]) { + ok_alloc = false; + low_space = true; + } + + /* * Loop until we find an allocation group that either has free inodes * or in which we can allocate some inodes. Iterate through the * allocation groups upward, wrapping at the end. 
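The rotor change above drops the m_agirotor_lock spinlock in favour of a bare atomic: each caller takes a unique ticket and reduces it modulo the AG count itself. A standalone sketch of the idiom, with hypothetical names and m_agirotor assumed to have been converted to an atomic_t by this series:

	/*
	 * Lockless round-robin slot pick. atomic_inc_return() hands each
	 * caller a distinct ticket; "- 1" starts the sequence at slot 0 and
	 * the modulo wraps it into [0, nr_slots). The counter may overflow
	 * freely; the brief skew in the distribution at wraparound is
	 * harmless for placement decisions like this one.
	 */
	static inline unsigned int pick_rotor_slot(atomic_t *rotor,
			unsigned int nr_slots)
	{
		return (atomic_inc_return(rotor) - 1) % nr_slots;
	}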
*/ - agno = start_agno; flags = XFS_ALLOC_FLAG_TRYLOCK; - for (;;) { - pag = xfs_perag_get(mp, agno); - if (xfs_dialloc_good_ag(*tpp, pag, mode, flags, ok_alloc)) { - error = xfs_dialloc_try_ag(tpp, pag, parent, +retry: + for_each_perag_wrap_at(mp, start_agno, mp->m_maxagi, agno, pag) { + if (xfs_dialloc_good_ag(pag, *tpp, mode, flags, ok_alloc)) { + error = xfs_dialloc_try_ag(pag, tpp, parent, &ino, ok_alloc); if (error != -EAGAIN) break; + error = 0; } if (xfs_is_shutdown(mp)) { error = -EFSCORRUPTED; break; } - if (++agno == mp->m_maxagi) - agno = 0; - if (agno == start_agno) { - if (!flags) { - error = -ENOSPC; - break; - } + } + if (pag) + xfs_perag_rele(pag); + if (error) + return error; + if (ino == NULLFSINO) { + if (flags) { flags = 0; + if (low_space) + ok_alloc = true; + goto retry; } - xfs_perag_put(pag); + return -ENOSPC; } - - if (!error) - *new_ino = ino; - xfs_perag_put(pag); - return error; + *new_ino = ino; + return 0; } /* @@ -1885,14 +1883,14 @@ next: STATIC int xfs_difree_inobt( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, - struct xfs_perag *pag, xfs_agino_t agino, struct xfs_icluster *xic, struct xfs_inobt_rec_incore *orec) { + struct xfs_mount *mp = pag->pag_mount; struct xfs_agi *agi = agbp->b_addr; struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; @@ -1907,7 +1905,7 @@ xfs_difree_inobt( /* * Initialize the cursor. */ - cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO); error = xfs_check_agi_freecount(cur); if (error) @@ -2019,20 +2017,20 @@ error0: */ STATIC int xfs_difree_finobt( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, - struct xfs_perag *pag, xfs_agino_t agino, struct xfs_inobt_rec_incore *ibtrec) /* inobt record */ { + struct xfs_mount *mp = pag->pag_mount; struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; int offset = agino - ibtrec->ir_startino; int error; int i; - cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_FINO); + cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_FINO); error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i); if (error) @@ -2179,7 +2177,7 @@ xfs_difree( /* * Fix up the inode allocation btree. */ - error = xfs_difree_inobt(mp, tp, agbp, pag, agino, xic, &rec); + error = xfs_difree_inobt(pag, tp, agbp, agino, xic, &rec); if (error) goto error0; @@ -2187,7 +2185,7 @@ xfs_difree( * Fix up the free inode btree. */ if (xfs_has_finobt(mp)) { - error = xfs_difree_finobt(mp, tp, agbp, pag, agino, &rec); + error = xfs_difree_finobt(pag, tp, agbp, agino, &rec); if (error) goto error0; } @@ -2200,15 +2198,15 @@ error0: STATIC int xfs_imap_lookup( - struct xfs_mount *mp, - struct xfs_trans *tp, struct xfs_perag *pag, + struct xfs_trans *tp, xfs_agino_t agino, xfs_agblock_t agbno, xfs_agblock_t *chunk_agbno, xfs_agblock_t *offset_agbno, int flags) { + struct xfs_mount *mp = pag->pag_mount; struct xfs_inobt_rec_incore rec; struct xfs_btree_cur *cur; struct xfs_buf *agbp; @@ -2229,7 +2227,7 @@ xfs_imap_lookup( * we have a record, we need to ensure it contains the inode number * we are looking up. 
*/ - cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); if (!error) { if (i) @@ -2263,12 +2261,13 @@ xfs_imap_lookup( */ int xfs_imap( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp, /* transaction pointer */ + struct xfs_perag *pag, + struct xfs_trans *tp, xfs_ino_t ino, /* inode to locate */ struct xfs_imap *imap, /* location map structure */ uint flags) /* flags for inode btree lookup */ { + struct xfs_mount *mp = pag->pag_mount; xfs_agblock_t agbno; /* block number of inode in the alloc group */ xfs_agino_t agino; /* inode number within alloc group */ xfs_agblock_t chunk_agbno; /* first block in inode chunk */ @@ -2276,17 +2275,15 @@ xfs_imap( int error; /* error code */ int offset; /* index of inode in its buffer */ xfs_agblock_t offset_agbno; /* blks from chunk start to inode */ - struct xfs_perag *pag; ASSERT(ino != NULLFSINO); /* * Split up the inode number into its parts. */ - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); agino = XFS_INO_TO_AGINO(mp, ino); agbno = XFS_AGINO_TO_AGBNO(mp, agino); - if (!pag || agbno >= mp->m_sb.sb_agblocks || + if (agbno >= mp->m_sb.sb_agblocks || ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { error = -EINVAL; #ifdef DEBUG @@ -2295,20 +2292,14 @@ xfs_imap( * as they can be invalid without implying corruption. */ if (flags & XFS_IGET_UNTRUSTED) - goto out_drop; - if (!pag) { - xfs_alert(mp, - "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)", - __func__, XFS_INO_TO_AGNO(mp, ino), - mp->m_sb.sb_agcount); - } + return error; if (agbno >= mp->m_sb.sb_agblocks) { xfs_alert(mp, "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)", __func__, (unsigned long long)agbno, (unsigned long)mp->m_sb.sb_agblocks); } - if (pag && ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { + if (ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { xfs_alert(mp, "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)", __func__, ino, @@ -2316,7 +2307,7 @@ xfs_imap( } xfs_stack_trace(); #endif /* DEBUG */ - goto out_drop; + return error; } /* @@ -2327,10 +2318,10 @@ xfs_imap( * in all cases where an untrusted inode number is passed. 
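Since xfs_imap() no longer performs its own perag lookup, the reference lifetime moves to the caller. A hedged caller-side sketch of the new contract, with error handling condensed and the surrounding function assumed:

	/* Sketch: mapping an inode number with a caller-held perag. */
	struct xfs_perag	*pag;
	struct xfs_imap		imap;
	int			error;

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
	if (!pag)
		return -EINVAL;		/* AG number out of range */
	error = xfs_imap(pag, tp, ino, &imap, 0);
	xfs_perag_put(pag);		/* xfs_imap() no longer drops it */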
*/ if (flags & XFS_IGET_UNTRUSTED) { - error = xfs_imap_lookup(mp, tp, pag, agino, agbno, + error = xfs_imap_lookup(pag, tp, agino, agbno, &chunk_agbno, &offset_agbno, flags); if (error) - goto out_drop; + return error; goto out_map; } @@ -2346,8 +2337,7 @@ xfs_imap( imap->im_len = XFS_FSB_TO_BB(mp, 1); imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); - error = 0; - goto out_drop; + return 0; } /* @@ -2359,10 +2349,10 @@ xfs_imap( offset_agbno = agbno & M_IGEO(mp)->inoalign_mask; chunk_agbno = agbno - offset_agbno; } else { - error = xfs_imap_lookup(mp, tp, pag, agino, agbno, + error = xfs_imap_lookup(pag, tp, agino, agbno, &chunk_agbno, &offset_agbno, flags); if (error) - goto out_drop; + return error; } out_map: @@ -2390,14 +2380,9 @@ out_map: __func__, (unsigned long long) imap->im_blkno, (unsigned long long) imap->im_len, XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); - error = -EINVAL; - goto out_drop; + return -EINVAL; } - error = 0; -out_drop: - if (pag) - xfs_perag_put(pag); - return error; + return 0; } /* @@ -2613,10 +2598,10 @@ xfs_ialloc_read_agi( return error; agi = agibp->b_addr; - if (!pag->pagi_init) { + if (!xfs_perag_initialised_agi(pag)) { pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); pag->pagi_count = be32_to_cpu(agi->agi_count); - pag->pagi_init = 1; + set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); } /* @@ -2924,26 +2909,24 @@ xfs_ialloc_calc_rootino( */ int xfs_ialloc_check_shrink( + struct xfs_perag *pag, struct xfs_trans *tp, - xfs_agnumber_t agno, struct xfs_buf *agibp, xfs_agblock_t new_length) { struct xfs_inobt_rec_incore rec; struct xfs_btree_cur *cur; - struct xfs_mount *mp = tp->t_mountp; - struct xfs_perag *pag; - xfs_agino_t agino = XFS_AGB_TO_AGINO(mp, new_length); + xfs_agino_t agino; int has; int error; - if (!xfs_has_sparseinodes(mp)) + if (!xfs_has_sparseinodes(pag->pag_mount)) return 0; - pag = xfs_perag_get(mp, agno); - cur = xfs_inobt_init_cursor(mp, tp, agibp, pag, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(pag, tp, agibp, XFS_BTNUM_INO); /* Look up the inobt record that would correspond to the new EOFS. 
*/ + agino = XFS_AGB_TO_AGINO(pag->pag_mount, new_length); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has); if (error || !has) goto out; @@ -2964,6 +2947,5 @@ xfs_ialloc_check_shrink( } out: xfs_btree_del_cursor(cur, error); - xfs_perag_put(pag); return error; } diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 9bbbca6ac4ed..ab8c30b4ec22 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -12,6 +12,7 @@ struct xfs_imap; struct xfs_mount; struct xfs_trans; struct xfs_btree_cur; +struct xfs_perag; /* Move inodes in clusters of this size */ #define XFS_INODE_BIG_CLUSTER_SIZE 8192 @@ -47,7 +48,7 @@ int xfs_difree(struct xfs_trans *tp, struct xfs_perag *pag, */ int xfs_imap( - struct xfs_mount *mp, /* file system mount structure */ + struct xfs_perag *pag, struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t ino, /* inode to locate */ struct xfs_imap *imap, /* location map structure */ @@ -106,7 +107,7 @@ int xfs_ialloc_cluster_alignment(struct xfs_mount *mp); void xfs_ialloc_setup_geometry(struct xfs_mount *mp); xfs_ino_t xfs_ialloc_calc_rootino(struct xfs_mount *mp, int sunit); -int xfs_ialloc_check_shrink(struct xfs_trans *tp, xfs_agnumber_t agno, +int xfs_ialloc_check_shrink(struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agibp, xfs_agblock_t new_length); #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 8c83e265770c..9b28211d5a4c 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -36,8 +36,8 @@ STATIC struct xfs_btree_cur * xfs_inobt_dup_cursor( struct xfs_btree_cur *cur) { - return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_ag.agbp, cur->bc_ag.pag, cur->bc_btnum); + return xfs_inobt_init_cursor(cur->bc_ag.pag, cur->bc_tp, + cur->bc_ag.agbp, cur->bc_btnum); } STATIC void @@ -103,15 +103,15 @@ __xfs_inobt_alloc_block( memset(&args, 0, sizeof(args)); args.tp = cur->bc_tp; args.mp = cur->bc_mp; + args.pag = cur->bc_ag.pag; args.oinfo = XFS_RMAP_OINFO_INOBT; - args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_ag.pag->pag_agno, sbno); args.minlen = 1; args.maxlen = 1; args.prod = 1; - args.type = XFS_ALLOCTYPE_NEAR_BNO; args.resv = resv; - error = xfs_alloc_vextent(&args); + error = xfs_alloc_vextent_near_bno(&args, + XFS_AGB_TO_FSB(args.mp, args.pag->pag_agno, sbno)); if (error) return error; @@ -291,8 +291,8 @@ xfs_inobt_verify( * Similarly, during log recovery we will have a perag structure * attached, but the agi information will not yet have been initialised * from the on disk AGI. We don't currently use any of this information, - * but beware of the landmine (i.e. need to check pag->pagi_init) if we - * ever do. + * but beware of the landmine (i.e. need to check + * xfs_perag_initialised_agi(pag)) if we ever do. */ if (xfs_has_crc(mp)) { fa = xfs_btree_sblock_v5hdr_verify(bp); @@ -427,11 +427,11 @@ static const struct xfs_btree_ops xfs_finobt_ops = { */ static struct xfs_btree_cur * xfs_inobt_init_common( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ struct xfs_perag *pag, + struct xfs_trans *tp, /* transaction pointer */ xfs_btnum_t btnum) /* ialloc or free ino btree */ { + struct xfs_mount *mp = pag->pag_mount; struct xfs_btree_cur *cur; cur = xfs_btree_alloc_cursor(mp, tp, btnum, @@ -456,16 +456,15 @@ xfs_inobt_init_common( /* Create an inode btree cursor. 
*/ struct xfs_btree_cur * xfs_inobt_init_cursor( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, - struct xfs_perag *pag, xfs_btnum_t btnum) { struct xfs_btree_cur *cur; struct xfs_agi *agi = agbp->b_addr; - cur = xfs_inobt_init_common(mp, tp, pag, btnum); + cur = xfs_inobt_init_common(pag, tp, btnum); if (btnum == XFS_BTNUM_INO) cur->bc_nlevels = be32_to_cpu(agi->agi_level); else @@ -477,14 +476,13 @@ xfs_inobt_init_cursor( /* Create an inode btree cursor with a fake root for staging. */ struct xfs_btree_cur * xfs_inobt_stage_cursor( - struct xfs_mount *mp, - struct xbtree_afakeroot *afake, struct xfs_perag *pag, + struct xbtree_afakeroot *afake, xfs_btnum_t btnum) { struct xfs_btree_cur *cur; - cur = xfs_inobt_init_common(mp, NULL, pag, btnum); + cur = xfs_inobt_init_common(pag, NULL, btnum); xfs_btree_stage_afakeroot(cur, afake); return cur; } @@ -708,9 +706,8 @@ xfs_inobt_max_size( /* Read AGI and create inobt cursor. */ int xfs_inobt_cur( - struct xfs_mount *mp, - struct xfs_trans *tp, struct xfs_perag *pag, + struct xfs_trans *tp, xfs_btnum_t which, struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp) @@ -725,16 +722,15 @@ xfs_inobt_cur( if (error) return error; - cur = xfs_inobt_init_cursor(mp, tp, *agi_bpp, pag, which); + cur = xfs_inobt_init_cursor(pag, tp, *agi_bpp, which); *curpp = cur; return 0; } static int xfs_inobt_count_blocks( - struct xfs_mount *mp, - struct xfs_trans *tp, struct xfs_perag *pag, + struct xfs_trans *tp, xfs_btnum_t btnum, xfs_extlen_t *tree_blocks) { @@ -742,7 +738,7 @@ xfs_inobt_count_blocks( struct xfs_btree_cur *cur = NULL; int error; - error = xfs_inobt_cur(mp, tp, pag, btnum, &cur, &agbp); + error = xfs_inobt_cur(pag, tp, btnum, &cur, &agbp); if (error) return error; @@ -779,22 +775,21 @@ xfs_finobt_read_blocks( */ int xfs_finobt_calc_reserves( - struct xfs_mount *mp, - struct xfs_trans *tp, struct xfs_perag *pag, + struct xfs_trans *tp, xfs_extlen_t *ask, xfs_extlen_t *used) { xfs_extlen_t tree_len = 0; int error; - if (!xfs_has_finobt(mp)) + if (!xfs_has_finobt(pag->pag_mount)) return 0; - if (xfs_has_inobtcounts(mp)) + if (xfs_has_inobtcounts(pag->pag_mount)) error = xfs_finobt_read_blocks(pag, tp, &tree_len); else - error = xfs_inobt_count_blocks(mp, tp, pag, XFS_BTNUM_FINO, + error = xfs_inobt_count_blocks(pag, tp, XFS_BTNUM_FINO, &tree_len); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index 26451cb76b98..e859a6e05230 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -46,12 +46,10 @@ struct xfs_perag; (maxrecs) * sizeof(xfs_inobt_key_t) + \ ((index) - 1) * sizeof(xfs_inobt_ptr_t))) -extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *mp, - struct xfs_trans *tp, struct xfs_buf *agbp, - struct xfs_perag *pag, xfs_btnum_t btnum); -struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_mount *mp, - struct xbtree_afakeroot *afake, struct xfs_perag *pag, - xfs_btnum_t btnum); +extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_perag *pag, + struct xfs_trans *tp, struct xfs_buf *agbp, xfs_btnum_t btnum); +struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_perag *pag, + struct xbtree_afakeroot *afake, xfs_btnum_t btnum); extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); /* ir_holemask to inode allocation bitmap conversion */ @@ -64,13 +62,13 @@ int xfs_inobt_rec_check_count(struct xfs_mount *, #define xfs_inobt_rec_check_count(mp, rec) 0 #endif /* DEBUG */ -int 
xfs_finobt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp, - struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used); +int xfs_finobt_calc_reserves(struct xfs_perag *perag, struct xfs_trans *tp, + xfs_extlen_t *ask, xfs_extlen_t *used); extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp, unsigned long long len); -int xfs_inobt_cur(struct xfs_mount *mp, struct xfs_trans *tp, - struct xfs_perag *pag, xfs_btnum_t btnum, - struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp); +int xfs_inobt_cur(struct xfs_perag *pag, struct xfs_trans *tp, + xfs_btnum_t btnum, struct xfs_btree_cur **curpp, + struct xfs_buf **agi_bpp); void xfs_inobt_commit_staged_btree(struct xfs_btree_cur *cur, struct xfs_trans *tp, struct xfs_buf *agbp); diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 6f7ed9288fe4..bcf46aa0d08b 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1213,37 +1213,33 @@ out_error: STATIC int xfs_refcount_adjust( struct xfs_btree_cur *cur, - xfs_agblock_t agbno, - xfs_extlen_t aglen, - xfs_agblock_t *new_agbno, - xfs_extlen_t *new_aglen, + xfs_agblock_t *agbno, + xfs_extlen_t *aglen, enum xfs_refc_adjust_op adj) { bool shape_changed; int shape_changes = 0; int error; - *new_agbno = agbno; - *new_aglen = aglen; if (adj == XFS_REFCOUNT_ADJUST_INCREASE) - trace_xfs_refcount_increase(cur->bc_mp, cur->bc_ag.pag->pag_agno, - agbno, aglen); + trace_xfs_refcount_increase(cur->bc_mp, + cur->bc_ag.pag->pag_agno, *agbno, *aglen); else - trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_ag.pag->pag_agno, - agbno, aglen); + trace_xfs_refcount_decrease(cur->bc_mp, + cur->bc_ag.pag->pag_agno, *agbno, *aglen); /* * Ensure that no rcextents cross the boundary of the adjustment range. */ error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED, - agbno, &shape_changed); + *agbno, &shape_changed); if (error) goto out_error; if (shape_changed) shape_changes++; error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED, - agbno + aglen, &shape_changed); + *agbno + *aglen, &shape_changed); if (error) goto out_error; if (shape_changed) @@ -1253,7 +1249,7 @@ xfs_refcount_adjust( * Try to merge with the left or right extents of the range. 
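After this change xfs_refcount_adjust() consumes its range in place: on return, *agbno and *aglen describe the unprocessed remainder instead of being copied to separate new_agbno/new_aglen outputs. An illustrative caller view; the inputs are hypothetical and the real requeueing is driven by the deferred-ops changes shown below:

	/*
	 * In/out range adjustment. aglen == 0 afterwards means the whole
	 * range was processed; a non-zero remainder is propagated through
	 * the intent (see xfs_refcount_continue_op() below) so it can be
	 * requeued after a transaction roll.
	 */
	xfs_agblock_t	agbno = start_agbno;	/* hypothetical inputs */
	xfs_extlen_t	aglen = len;
	int		error;

	error = xfs_refcount_adjust(cur, &agbno, &aglen,
			XFS_REFCOUNT_ADJUST_INCREASE);
	if (!error && aglen > 0) {
		/* remainder carried forward in the intent */
	}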
*/ error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_SHARED, - new_agbno, new_aglen, adj, &shape_changed); + agbno, aglen, adj, &shape_changed); if (error) goto out_error; if (shape_changed) @@ -1262,7 +1258,7 @@ xfs_refcount_adjust( cur->bc_ag.refc.shape_changes++; /* Now that we've taken care of the ends, adjust the middle extents */ - error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen, adj); + error = xfs_refcount_adjust_extents(cur, agbno, aglen, adj); if (error) goto out_error; @@ -1298,21 +1294,20 @@ xfs_refcount_finish_one_cleanup( static inline int xfs_refcount_continue_op( struct xfs_btree_cur *cur, - xfs_fsblock_t startblock, - xfs_agblock_t new_agbno, - xfs_extlen_t new_len, - xfs_fsblock_t *new_fsbno) + struct xfs_refcount_intent *ri, + xfs_agblock_t new_agbno) { struct xfs_mount *mp = cur->bc_mp; struct xfs_perag *pag = cur->bc_ag.pag; - if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, new_len))) + if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, + ri->ri_blockcount))) return -EFSCORRUPTED; - *new_fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); + ri->ri_startblock = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); - ASSERT(xfs_verify_fsbext(mp, *new_fsbno, new_len)); - ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, *new_fsbno)); + ASSERT(xfs_verify_fsbext(mp, ri->ri_startblock, ri->ri_blockcount)); + ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, ri->ri_startblock)); return 0; } @@ -1327,11 +1322,7 @@ xfs_refcount_continue_op( int xfs_refcount_finish_one( struct xfs_trans *tp, - enum xfs_refcount_intent_type type, - xfs_fsblock_t startblock, - xfs_extlen_t blockcount, - xfs_fsblock_t *new_fsb, - xfs_extlen_t *new_len, + struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur) { struct xfs_mount *mp = tp->t_mountp; @@ -1339,17 +1330,16 @@ xfs_refcount_finish_one( struct xfs_buf *agbp = NULL; int error = 0; xfs_agblock_t bno; - xfs_agblock_t new_agbno; unsigned long nr_ops = 0; int shape_changes = 0; struct xfs_perag *pag; - pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock)); - bno = XFS_FSB_TO_AGBNO(mp, startblock); + pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock)); + bno = XFS_FSB_TO_AGBNO(mp, ri->ri_startblock); - trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, startblock), - type, XFS_FSB_TO_AGBNO(mp, startblock), - blockcount); + trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock), + ri->ri_type, XFS_FSB_TO_AGBNO(mp, ri->ri_startblock), + ri->ri_blockcount); if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) { error = -EIO; @@ -1380,42 +1370,42 @@ xfs_refcount_finish_one( } *pcur = rcur; - switch (type) { + switch (ri->ri_type) { case XFS_REFCOUNT_INCREASE: - error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, - new_len, XFS_REFCOUNT_ADJUST_INCREASE); + error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, + XFS_REFCOUNT_ADJUST_INCREASE); if (error) goto out_drop; - if (*new_len > 0) - error = xfs_refcount_continue_op(rcur, startblock, - new_agbno, *new_len, new_fsb); + if (ri->ri_blockcount > 0) + error = xfs_refcount_continue_op(rcur, ri, bno); break; case XFS_REFCOUNT_DECREASE: - error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, - new_len, XFS_REFCOUNT_ADJUST_DECREASE); + error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, + XFS_REFCOUNT_ADJUST_DECREASE); if (error) goto out_drop; - if (*new_len > 0) - error = xfs_refcount_continue_op(rcur, startblock, - new_agbno, *new_len, new_fsb); + if (ri->ri_blockcount > 0) + error = 
xfs_refcount_continue_op(rcur, ri, bno); break; case XFS_REFCOUNT_ALLOC_COW: - *new_fsb = startblock + blockcount; - *new_len = 0; - error = __xfs_refcount_cow_alloc(rcur, bno, blockcount); + error = __xfs_refcount_cow_alloc(rcur, bno, ri->ri_blockcount); + if (error) + goto out_drop; + ri->ri_blockcount = 0; break; case XFS_REFCOUNT_FREE_COW: - *new_fsb = startblock + blockcount; - *new_len = 0; - error = __xfs_refcount_cow_free(rcur, bno, blockcount); + error = __xfs_refcount_cow_free(rcur, bno, ri->ri_blockcount); + if (error) + goto out_drop; + ri->ri_blockcount = 0; break; default: ASSERT(0); error = -EFSCORRUPTED; } - if (!error && *new_len > 0) - trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno, type, - bno, blockcount, new_agbno, *new_len); + if (!error && ri->ri_blockcount > 0) + trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno, + ri->ri_type, bno, ri->ri_blockcount); out_drop: xfs_perag_put(pag); return error; diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 452f30556f5a..c633477ce3ce 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -75,9 +75,7 @@ void xfs_refcount_decrease_extent(struct xfs_trans *tp, extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp, struct xfs_btree_cur *rcur, int error); extern int xfs_refcount_finish_one(struct xfs_trans *tp, - enum xfs_refcount_intent_type type, xfs_fsblock_t startblock, - xfs_extlen_t blockcount, xfs_fsblock_t *new_fsb, - xfs_extlen_t *new_len, struct xfs_btree_cur **pcur); + struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur); extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur, xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index e1f789866683..f3b860970b26 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -67,14 +67,14 @@ xfs_refcountbt_alloc_block( memset(&args, 0, sizeof(args)); args.tp = cur->bc_tp; args.mp = cur->bc_mp; - args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, - xfs_refc_block(args.mp)); + args.pag = cur->bc_ag.pag; args.oinfo = XFS_RMAP_OINFO_REFC; args.minlen = args.maxlen = args.prod = 1; args.resv = XFS_AG_RESV_METADATA; - error = xfs_alloc_vextent(&args); + error = xfs_alloc_vextent_near_bno(&args, + XFS_AGB_TO_FSB(args.mp, args.pag->pag_agno, + xfs_refc_block(args.mp))); if (error) goto out_error; trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_ag.pag->pag_agno, @@ -227,7 +227,7 @@ xfs_refcountbt_verify( return fa; level = be16_to_cpu(block->bb_level); - if (pag && pag->pagf_init) { + if (pag && xfs_perag_initialised_agf(pag)) { if (level >= pag->pagf_refcount_level) return __this_address; } else if (level >= mp->m_refc_maxlevels) diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index b56aca1e7c66..df720041cd3d 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -2390,13 +2390,7 @@ xfs_rmap_finish_one_cleanup( int xfs_rmap_finish_one( struct xfs_trans *tp, - enum xfs_rmap_intent_type type, - uint64_t owner, - int whichfork, - xfs_fileoff_t startoff, - xfs_fsblock_t startblock, - xfs_filblks_t blockcount, - xfs_exntst_t state, + struct xfs_rmap_intent *ri, struct xfs_btree_cur **pcur) { struct xfs_mount *mp = tp->t_mountp; @@ -2408,11 +2402,13 @@ xfs_rmap_finish_one( xfs_agblock_t bno; bool unwritten; - pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock)); - bno = 
XFS_FSB_TO_AGBNO(mp, startblock); + pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock)); + bno = XFS_FSB_TO_AGBNO(mp, ri->ri_bmap.br_startblock); - trace_xfs_rmap_deferred(mp, pag->pag_agno, type, bno, owner, whichfork, - startoff, blockcount, state); + trace_xfs_rmap_deferred(mp, pag->pag_agno, ri->ri_type, bno, + ri->ri_owner, ri->ri_whichfork, + ri->ri_bmap.br_startoff, ri->ri_bmap.br_blockcount, + ri->ri_bmap.br_state); if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) { error = -EIO; @@ -2448,35 +2444,37 @@ xfs_rmap_finish_one( } *pcur = rcur; - xfs_rmap_ino_owner(&oinfo, owner, whichfork, startoff); - unwritten = state == XFS_EXT_UNWRITTEN; - bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, startblock); + xfs_rmap_ino_owner(&oinfo, ri->ri_owner, ri->ri_whichfork, + ri->ri_bmap.br_startoff); + unwritten = ri->ri_bmap.br_state == XFS_EXT_UNWRITTEN; + bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, ri->ri_bmap.br_startblock); - switch (type) { + switch (ri->ri_type) { case XFS_RMAP_ALLOC: case XFS_RMAP_MAP: - error = xfs_rmap_map(rcur, bno, blockcount, unwritten, &oinfo); + error = xfs_rmap_map(rcur, bno, ri->ri_bmap.br_blockcount, + unwritten, &oinfo); break; case XFS_RMAP_MAP_SHARED: - error = xfs_rmap_map_shared(rcur, bno, blockcount, unwritten, - &oinfo); + error = xfs_rmap_map_shared(rcur, bno, + ri->ri_bmap.br_blockcount, unwritten, &oinfo); break; case XFS_RMAP_FREE: case XFS_RMAP_UNMAP: - error = xfs_rmap_unmap(rcur, bno, blockcount, unwritten, - &oinfo); + error = xfs_rmap_unmap(rcur, bno, ri->ri_bmap.br_blockcount, + unwritten, &oinfo); break; case XFS_RMAP_UNMAP_SHARED: - error = xfs_rmap_unmap_shared(rcur, bno, blockcount, unwritten, - &oinfo); + error = xfs_rmap_unmap_shared(rcur, bno, + ri->ri_bmap.br_blockcount, unwritten, &oinfo); break; case XFS_RMAP_CONVERT: - error = xfs_rmap_convert(rcur, bno, blockcount, !unwritten, - &oinfo); + error = xfs_rmap_convert(rcur, bno, ri->ri_bmap.br_blockcount, + !unwritten, &oinfo); break; case XFS_RMAP_CONVERT_SHARED: - error = xfs_rmap_convert_shared(rcur, bno, blockcount, - !unwritten, &oinfo); + error = xfs_rmap_convert_shared(rcur, bno, + ri->ri_bmap.br_blockcount, !unwritten, &oinfo); break; default: ASSERT(0); diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 54741a591a17..2dac88cea28d 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -179,10 +179,8 @@ void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno, void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp, struct xfs_btree_cur *rcur, int error); -int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type, - uint64_t owner, int whichfork, xfs_fileoff_t startoff, - xfs_fsblock_t startblock, xfs_filblks_t blockcount, - xfs_exntst_t state, struct xfs_btree_cur **pcur); +int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri, + struct xfs_btree_cur **pcur); int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, uint64_t owner, uint64_t offset, unsigned int flags, diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 7f83f62e51e0..d3285684bb5e 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -313,7 +313,7 @@ xfs_rmapbt_verify( return fa; level = be16_to_cpu(block->bb_level); - if (pag && pag->pagf_init) { + if (pag && xfs_perag_initialised_agf(pag)) { if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi]) return __this_address; } else if (level >= mp->m_rmap_maxlevels) diff --git a/fs/xfs/libxfs/xfs_sb.c 
b/fs/xfs/libxfs/xfs_sb.c index 1eeecf2eb2a7..99cc03a298e2 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -909,7 +909,8 @@ xfs_sb_mount_common( struct xfs_mount *mp, struct xfs_sb *sbp) { - mp->m_agfrotor = mp->m_agirotor = 0; + mp->m_agfrotor = 0; + atomic_set(&mp->m_agirotor, 0); mp->m_maxagi = mp->m_sb.sb_agcount; mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG; mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT; diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index d75d82151eeb..c37e6d72760b 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -191,14 +191,15 @@ xrep_agf_init_header( struct xfs_agf *old_agf) { struct xfs_mount *mp = sc->mp; + struct xfs_perag *pag = sc->sa.pag; struct xfs_agf *agf = agf_bp->b_addr; memcpy(old_agf, agf, sizeof(*old_agf)); memset(agf, 0, BBTOB(agf_bp->b_length)); agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); - agf->agf_seqno = cpu_to_be32(sc->sa.pag->pag_agno); - agf->agf_length = cpu_to_be32(sc->sa.pag->block_count); + agf->agf_seqno = cpu_to_be32(pag->pag_agno); + agf->agf_length = cpu_to_be32(pag->block_count); agf->agf_flfirst = old_agf->agf_flfirst; agf->agf_fllast = old_agf->agf_fllast; agf->agf_flcount = old_agf->agf_flcount; @@ -206,8 +207,8 @@ xrep_agf_init_header( uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid); /* Mark the incore AGF data stale until we're done fixing things. */ - ASSERT(sc->sa.pag->pagf_init); - sc->sa.pag->pagf_init = 0; + ASSERT(xfs_perag_initialised_agf(pag)); + clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); } /* Set btree root information in an AGF. */ @@ -333,7 +334,7 @@ xrep_agf_commit_new( pag->pagf_levels[XFS_BTNUM_RMAPi] = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); - pag->pagf_init = 1; + set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); return 0; } @@ -434,7 +435,7 @@ xrep_agf( out_revert: /* Mark the incore AGF state stale and revert the AGF. */ - sc->sa.pag->pagf_init = 0; + clear_bit(XFS_AGSTATE_AGF_INIT, &sc->sa.pag->pag_opstate); memcpy(agf, &old_agf, sizeof(old_agf)); return error; } @@ -618,7 +619,7 @@ xrep_agfl_update_agf( xfs_force_summary_recalc(sc->mp); /* Update the AGF counters. */ - if (sc->sa.pag->pagf_init) + if (xfs_perag_initialised_agf(sc->sa.pag)) sc->sa.pag->pagf_flcount = flcount; agf->agf_flfirst = cpu_to_be32(0); agf->agf_flcount = cpu_to_be32(flcount); @@ -822,14 +823,15 @@ xrep_agi_init_header( struct xfs_agi *old_agi) { struct xfs_agi *agi = agi_bp->b_addr; + struct xfs_perag *pag = sc->sa.pag; struct xfs_mount *mp = sc->mp; memcpy(old_agi, agi, sizeof(*old_agi)); memset(agi, 0, BBTOB(agi_bp->b_length)); agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); - agi->agi_seqno = cpu_to_be32(sc->sa.pag->pag_agno); - agi->agi_length = cpu_to_be32(sc->sa.pag->block_count); + agi->agi_seqno = cpu_to_be32(pag->pag_agno); + agi->agi_length = cpu_to_be32(pag->block_count); agi->agi_newino = cpu_to_be32(NULLAGINO); agi->agi_dirino = cpu_to_be32(NULLAGINO); if (xfs_has_crc(mp)) @@ -840,8 +842,8 @@ xrep_agi_init_header( sizeof(agi->agi_unlinked)); /* Mark the incore AGF data stale until we're done fixing things. */ - ASSERT(sc->sa.pag->pagi_init); - sc->sa.pag->pagi_init = 0; + ASSERT(xfs_perag_initialised_agi(pag)); + clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); } /* Set btree root information in an AGI. 
*/ @@ -873,8 +875,7 @@ xrep_agi_calc_from_btrees( xfs_agino_t freecount; int error; - cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp, - sc->sa.pag, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp, XFS_BTNUM_INO); error = xfs_ialloc_count_inodes(cur, &count, &freecount); if (error) goto err; @@ -894,8 +895,8 @@ xrep_agi_calc_from_btrees( if (xfs_has_finobt(mp) && xfs_has_inobtcounts(mp)) { xfs_agblock_t blocks; - cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp, - sc->sa.pag, XFS_BTNUM_FINO); + cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp, + XFS_BTNUM_FINO); error = xfs_btree_count_blocks(cur, &blocks); if (error) goto err; @@ -929,7 +930,7 @@ xrep_agi_commit_new( pag = sc->sa.pag; pag->pagi_count = be32_to_cpu(agi->agi_count); pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); - pag->pagi_init = 1; + set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); return 0; } @@ -994,7 +995,7 @@ xrep_agi( out_revert: /* Mark the incore AGI state stale and revert the AGI. */ - sc->sa.pag->pagi_init = 0; + clear_bit(XFS_AGSTATE_AGI_INIT, &sc->sa.pag->pag_opstate); memcpy(agi, &old_agi, sizeof(old_agi)); return error; } diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index d50d0eab196a..dbbc7037074c 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -662,7 +662,7 @@ xchk_bmap_check_rmaps( error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag); if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { - xfs_perag_put(pag); + xfs_perag_rele(pag); return error; } } diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 613260b04a3d..848a8e32e56f 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -478,15 +478,15 @@ xchk_ag_btcur_init( /* Set up a inobt cursor for cross-referencing. */ if (sa->agi_bp && xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_INO)) { - sa->ino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp, - sa->pag, XFS_BTNUM_INO); + sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp, + XFS_BTNUM_INO); } /* Set up a finobt cursor for cross-referencing. */ if (sa->agi_bp && xfs_has_finobt(mp) && xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) { - sa->fino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp, - sa->pag, XFS_BTNUM_FINO); + sa->fino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp, + XFS_BTNUM_FINO); } /* Set up a rmapbt cursor for cross-referencing. */ @@ -636,6 +636,7 @@ xchk_get_inode( { struct xfs_imap imap; struct xfs_mount *mp = sc->mp; + struct xfs_perag *pag; struct xfs_inode *ip_in = XFS_I(file_inode(sc->file)); struct xfs_inode *ip = NULL; int error; @@ -671,10 +672,14 @@ xchk_get_inode( * Otherwise, we really couldn't find it so tell userspace * that it no longer exists. 
*/ - error = xfs_imap(sc->mp, sc->tp, sc->sm->sm_ino, &imap, - XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE); - if (error) - return -ENOENT; + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino)); + if (pag) { + error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap, + XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE); + xfs_perag_put(pag); + if (error) + return -ENOENT; + } error = -EFSCORRUPTED; fallthrough; default: diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 4777e7b89fdc..f0c7f41897b9 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -86,7 +86,8 @@ xchk_fscount_warmup( for_each_perag(mp, agno, pag) { if (xchk_should_terminate(sc, &error)) break; - if (pag->pagi_init && pag->pagf_init) + if (xfs_perag_initialised_agi(pag) && + xfs_perag_initialised_agf(pag)) continue; /* Lock both AG headers. */ @@ -101,7 +102,8 @@ xchk_fscount_warmup( * These are supposed to be initialized by the header read * function. */ - if (!pag->pagi_init || !pag->pagf_init) { + if (!xfs_perag_initialised_agi(pag) || + !xfs_perag_initialised_agf(pag)) { error = -EFSCORRUPTED; break; } @@ -117,7 +119,7 @@ xchk_fscount_warmup( if (agi_bp) xfs_buf_relse(agi_bp); if (pag) - xfs_perag_put(pag); + xfs_perag_rele(pag); return error; } @@ -220,7 +222,8 @@ retry: break; /* This somehow got unset since the warmup? */ - if (!pag->pagi_init || !pag->pagf_init) { + if (!xfs_perag_initialised_agi(pag) || + !xfs_perag_initialised_agf(pag)) { error = -EFSCORRUPTED; break; } @@ -249,7 +252,7 @@ retry: } if (pag) - xfs_perag_put(pag); + xfs_perag_rele(pag); if (error) { xchk_set_incomplete(sc); return error; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 4b92f9253ccd..1b71174ec0d6 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -206,7 +206,7 @@ xrep_calc_ag_resblks( return 0; pag = xfs_perag_get(mp, sm->sm_agno); - if (pag->pagi_init) { + if (xfs_perag_initialised_agi(pag)) { /* Use in-core icount if possible. 
*/ icount = pag->pagi_count; } else { @@ -326,15 +326,14 @@ xrep_alloc_ag_block( args.tp = sc->tp; args.mp = sc->mp; + args.pag = sc->sa.pag; args.oinfo = *oinfo; - args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.pag->pag_agno, 0); args.minlen = 1; args.maxlen = 1; args.prod = 1; - args.type = XFS_ALLOCTYPE_THIS_AG; args.resv = resv; - error = xfs_alloc_vextent(&args); + error = xfs_alloc_vextent_this_ag(&args, sc->sa.pag->pag_agno); if (error) return error; if (args.fsbno == NULLFSBLOCK) diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index a05f44eb8178..791db7d9c849 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -242,7 +242,7 @@ xfs_acl_set_mode( } int -xfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +xfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { umode_t mode; @@ -258,7 +258,7 @@ xfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, return error; if (type == ACL_TYPE_ACCESS) { - error = posix_acl_update_mode(mnt_userns, inode, &mode, &acl); + error = posix_acl_update_mode(idmap, inode, &mode, &acl); if (error) return error; set_mode = true; diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index dcd176149c7a..bf7f960997d3 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -11,7 +11,7 @@ struct posix_acl; #ifdef CONFIG_XFS_POSIX_ACL extern struct posix_acl *xfs_get_acl(struct inode *inode, int type, bool rcu); -extern int xfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, +extern int xfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); void xfs_forget_acl(struct inode *inode, const char *name); diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 41323da523d1..6e2f0013380a 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -246,18 +246,11 @@ static int xfs_trans_log_finish_bmap_update( struct xfs_trans *tp, struct xfs_bud_log_item *budp, - enum xfs_bmap_intent_type type, - struct xfs_inode *ip, - int whichfork, - xfs_fileoff_t startoff, - xfs_fsblock_t startblock, - xfs_filblks_t *blockcount, - xfs_exntst_t state) + struct xfs_bmap_intent *bi) { int error; - error = xfs_bmap_finish_one(tp, ip, type, whichfork, startoff, - startblock, blockcount, state); + error = xfs_bmap_finish_one(tp, bi); /* * Mark the transaction dirty, even on error. This ensures the @@ -290,24 +283,24 @@ xfs_bmap_update_diff_items( /* Set the map extent flags for this mapping. */ static void xfs_trans_set_bmap_flags( - struct xfs_map_extent *bmap, + struct xfs_map_extent *map, enum xfs_bmap_intent_type type, int whichfork, xfs_exntst_t state) { - bmap->me_flags = 0; + map->me_flags = 0; switch (type) { case XFS_BMAP_MAP: case XFS_BMAP_UNMAP: - bmap->me_flags = type; + map->me_flags = type; break; default: ASSERT(0); } if (state == XFS_EXT_UNWRITTEN) - bmap->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN; + map->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN; if (whichfork == XFS_ATTR_FORK) - bmap->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK; + map->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK; } /* Log bmap updates in the intent item. 
*/ @@ -315,7 +308,7 @@ STATIC void xfs_bmap_update_log_item( struct xfs_trans *tp, struct xfs_bui_log_item *buip, - struct xfs_bmap_intent *bmap) + struct xfs_bmap_intent *bi) { uint next_extent; struct xfs_map_extent *map; @@ -331,12 +324,12 @@ xfs_bmap_update_log_item( next_extent = atomic_inc_return(&buip->bui_next_extent) - 1; ASSERT(next_extent < buip->bui_format.bui_nextents); map = &buip->bui_format.bui_extents[next_extent]; - map->me_owner = bmap->bi_owner->i_ino; - map->me_startblock = bmap->bi_bmap.br_startblock; - map->me_startoff = bmap->bi_bmap.br_startoff; - map->me_len = bmap->bi_bmap.br_blockcount; - xfs_trans_set_bmap_flags(map, bmap->bi_type, bmap->bi_whichfork, - bmap->bi_bmap.br_state); + map->me_owner = bi->bi_owner->i_ino; + map->me_startblock = bi->bi_bmap.br_startblock; + map->me_startoff = bi->bi_bmap.br_startoff; + map->me_len = bi->bi_bmap.br_blockcount; + xfs_trans_set_bmap_flags(map, bi->bi_type, bi->bi_whichfork, + bi->bi_bmap.br_state); } static struct xfs_log_item * @@ -348,15 +341,15 @@ xfs_bmap_update_create_intent( { struct xfs_mount *mp = tp->t_mountp; struct xfs_bui_log_item *buip = xfs_bui_init(mp); - struct xfs_bmap_intent *bmap; + struct xfs_bmap_intent *bi; ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); xfs_trans_add_item(tp, &buip->bui_item); if (sort) list_sort(mp, items, xfs_bmap_update_diff_items); - list_for_each_entry(bmap, items, bi_list) - xfs_bmap_update_log_item(tp, buip, bmap); + list_for_each_entry(bi, items, bi_list) + xfs_bmap_update_log_item(tp, buip, bi); return &buip->bui_item; } @@ -378,25 +371,17 @@ xfs_bmap_update_finish_item( struct list_head *item, struct xfs_btree_cur **state) { - struct xfs_bmap_intent *bmap; - xfs_filblks_t count; + struct xfs_bmap_intent *bi; int error; - bmap = container_of(item, struct xfs_bmap_intent, bi_list); - count = bmap->bi_bmap.br_blockcount; - error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), - bmap->bi_type, - bmap->bi_owner, bmap->bi_whichfork, - bmap->bi_bmap.br_startoff, - bmap->bi_bmap.br_startblock, - &count, - bmap->bi_bmap.br_state); - if (!error && count > 0) { - ASSERT(bmap->bi_type == XFS_BMAP_UNMAP); - bmap->bi_bmap.br_blockcount = count; + bi = container_of(item, struct xfs_bmap_intent, bi_list); + + error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), bi); + if (!error && bi->bi_bmap.br_blockcount > 0) { + ASSERT(bi->bi_type == XFS_BMAP_UNMAP); return -EAGAIN; } - kmem_cache_free(xfs_bmap_intent_cache, bmap); + kmem_cache_free(xfs_bmap_intent_cache, bi); return error; } @@ -413,10 +398,10 @@ STATIC void xfs_bmap_update_cancel_item( struct list_head *item) { - struct xfs_bmap_intent *bmap; + struct xfs_bmap_intent *bi; - bmap = container_of(item, struct xfs_bmap_intent, bi_list); - kmem_cache_free(xfs_bmap_intent_cache, bmap); + bi = container_of(item, struct xfs_bmap_intent, bi_list); + kmem_cache_free(xfs_bmap_intent_cache, bi); } const struct xfs_defer_op_type xfs_bmap_update_defer_type = { @@ -434,18 +419,18 @@ xfs_bui_validate( struct xfs_mount *mp, struct xfs_bui_log_item *buip) { - struct xfs_map_extent *bmap; + struct xfs_map_extent *map; /* Only one mapping operation per BUI... 
*/ if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) return false; - bmap = &buip->bui_format.bui_extents[0]; + map = &buip->bui_format.bui_extents[0]; - if (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS) + if (map->me_flags & ~XFS_BMAP_EXTENT_FLAGS) return false; - switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) { + switch (map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) { case XFS_BMAP_MAP: case XFS_BMAP_UNMAP: break; @@ -453,13 +438,13 @@ xfs_bui_validate( return false; } - if (!xfs_verify_ino(mp, bmap->me_owner)) + if (!xfs_verify_ino(mp, map->me_owner)) return false; - if (!xfs_verify_fileext(mp, bmap->me_startoff, bmap->me_len)) + if (!xfs_verify_fileext(mp, map->me_startoff, map->me_len)) return false; - return xfs_verify_fsbext(mp, bmap->me_startblock, bmap->me_len); + return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); } /* @@ -471,17 +456,13 @@ xfs_bui_item_recover( struct xfs_log_item *lip, struct list_head *capture_list) { - struct xfs_bmbt_irec irec; + struct xfs_bmap_intent fake = { }; struct xfs_bui_log_item *buip = BUI_ITEM(lip); struct xfs_trans *tp; struct xfs_inode *ip = NULL; struct xfs_mount *mp = lip->li_log->l_mp; - struct xfs_map_extent *bmap; + struct xfs_map_extent *map; struct xfs_bud_log_item *budp; - xfs_filblks_t count; - xfs_exntst_t state; - unsigned int bui_type; - int whichfork; int iext_delta; int error = 0; @@ -491,14 +472,12 @@ xfs_bui_item_recover( return -EFSCORRUPTED; } - bmap = &buip->bui_format.bui_extents[0]; - state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? - XFS_EXT_UNWRITTEN : XFS_EXT_NORM; - whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? + map = &buip->bui_format.bui_extents[0]; + fake.bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? XFS_ATTR_FORK : XFS_DATA_FORK; - bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; + fake.bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; - error = xlog_recover_iget(mp, bmap->me_owner, &ip); + error = xlog_recover_iget(mp, map->me_owner, &ip); if (error) return error; @@ -512,34 +491,34 @@ xfs_bui_item_recover( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - if (bui_type == XFS_BMAP_MAP) + if (fake.bi_type == XFS_BMAP_MAP) iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT; else iext_delta = XFS_IEXT_PUNCH_HOLE_CNT; - error = xfs_iext_count_may_overflow(ip, whichfork, iext_delta); + error = xfs_iext_count_may_overflow(ip, fake.bi_whichfork, iext_delta); if (error == -EFBIG) error = xfs_iext_count_upgrade(tp, ip, iext_delta); if (error) goto err_cancel; - count = bmap->me_len; - error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip, - whichfork, bmap->me_startoff, bmap->me_startblock, - &count, state); + fake.bi_owner = ip; + fake.bi_bmap.br_startblock = map->me_startblock; + fake.bi_bmap.br_startoff = map->me_startoff; + fake.bi_bmap.br_blockcount = map->me_len; + fake.bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? 
+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + + error = xfs_trans_log_finish_bmap_update(tp, budp, &fake); if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bmap, - sizeof(*bmap)); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, map, + sizeof(*map)); if (error) goto err_cancel; - if (count > 0) { - ASSERT(bui_type == XFS_BMAP_UNMAP); - irec.br_startblock = bmap->me_startblock; - irec.br_blockcount = count; - irec.br_startoff = bmap->me_startoff; - irec.br_state = state; - xfs_bmap_unmap_extent(tp, ip, &irec); + if (fake.bi_bmap.br_blockcount > 0) { + ASSERT(fake.bi_type == XFS_BMAP_UNMAP); + xfs_bmap_unmap_extent(tp, ip, &fake.bi_bmap); } /* @@ -579,18 +558,18 @@ xfs_bui_item_relog( { struct xfs_bud_log_item *budp; struct xfs_bui_log_item *buip; - struct xfs_map_extent *extp; + struct xfs_map_extent *map; unsigned int count; count = BUI_ITEM(intent)->bui_format.bui_nextents; - extp = BUI_ITEM(intent)->bui_format.bui_extents; + map = BUI_ITEM(intent)->bui_format.bui_extents; tp->t_flags |= XFS_TRANS_DIRTY; budp = xfs_trans_get_bud(tp, BUI_ITEM(intent)); set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); buip = xfs_bui_init(tp->t_mountp); - memcpy(buip->bui_format.bui_extents, extp, count * sizeof(*extp)); + memcpy(buip->bui_format.bui_extents, map, count * sizeof(*map)); atomic_set(&buip->bui_next_extent, count); xfs_trans_add_item(tp, &buip->bui_item); set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 867645b74d88..a09dd2606479 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1410,7 +1410,7 @@ xfs_swap_extent_rmap( /* Unmap the old blocks in the source file. */ while (tirec.br_blockcount) { - ASSERT(tp->t_firstblock == NULLFSBLOCK); + ASSERT(tp->t_highest_agno == NULLAGNUMBER); trace_xfs_swap_extent_rmap_remap_piece(tip, &tirec); /* Read extent from the source file */ diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index bfc829c07f03..afc4c78b9eed 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -21,23 +21,20 @@ STATIC int xfs_trim_extents( - struct xfs_mount *mp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_daddr_t start, xfs_daddr_t end, xfs_daddr_t minlen, uint64_t *blocks_trimmed) { + struct xfs_mount *mp = pag->pag_mount; struct block_device *bdev = mp->m_ddev_targp->bt_bdev; struct xfs_btree_cur *cur; struct xfs_buf *agbp; struct xfs_agf *agf; - struct xfs_perag *pag; int error; int i; - pag = xfs_perag_get(mp, agno); - /* * Force out the log. This means any transactions that might have freed * space before we take the AGF buffer lock are now on disk, and the @@ -47,7 +44,7 @@ xfs_trim_extents( error = xfs_alloc_read_agf(pag, NULL, 0, &agbp); if (error) - goto out_put_perag; + return error; agf = agbp->b_addr; cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT); @@ -71,10 +68,10 @@ xfs_trim_extents( error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); if (error) - goto out_del_cursor; + break; if (XFS_IS_CORRUPT(mp, i != 1)) { error = -EFSCORRUPTED; - goto out_del_cursor; + break; } ASSERT(flen <= be32_to_cpu(agf->agf_longest)); @@ -83,15 +80,15 @@ xfs_trim_extents( * the format the range/len variables are supplied in by * userspace. */ - dbno = XFS_AGB_TO_DADDR(mp, agno, fbno); + dbno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, fbno); dlen = XFS_FSB_TO_BB(mp, flen); /* * Too small? Give up. 
*/ if (dlen < minlen) { - trace_xfs_discard_toosmall(mp, agno, fbno, flen); - goto out_del_cursor; + trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen); + break; } /* @@ -100,7 +97,7 @@ * down partially overlapping ranges for now. */ if (dbno + dlen < start || dbno > end) { - trace_xfs_discard_exclude(mp, agno, fbno, flen); + trace_xfs_discard_exclude(mp, pag->pag_agno, fbno, flen); goto next_extent; } /* @@ -109,32 +106,30 @@ * discard and try again the next time. */ if (xfs_extent_busy_search(mp, pag, fbno, flen)) { - trace_xfs_discard_busy(mp, agno, fbno, flen); + trace_xfs_discard_busy(mp, pag->pag_agno, fbno, flen); goto next_extent; } - trace_xfs_discard_extent(mp, agno, fbno, flen); + trace_xfs_discard_extent(mp, pag->pag_agno, fbno, flen); error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS); if (error) - goto out_del_cursor; + break; *blocks_trimmed += flen; next_extent: error = xfs_btree_decrement(cur, 0, &i); if (error) - goto out_del_cursor; + break; if (fatal_signal_pending(current)) { error = -ERESTARTSYS; - goto out_del_cursor; + break; } } out_del_cursor: xfs_btree_del_cursor(cur, error); xfs_buf_relse(agbp); -out_put_perag: - xfs_perag_put(pag); return error; } @@ -152,11 +147,12 @@ xfs_ioc_trim( struct xfs_mount *mp, struct fstrim_range __user *urange) { + struct xfs_perag *pag; unsigned int granularity = bdev_discard_granularity(mp->m_ddev_targp->bt_bdev); struct fstrim_range range; xfs_daddr_t start, end, minlen; - xfs_agnumber_t start_agno, end_agno, agno; + xfs_agnumber_t agno; uint64_t blocks_trimmed = 0; int error, last_error = 0; @@ -193,18 +189,18 @@ xfs_ioc_trim( end = start + BTOBBT(range.len) - 1; if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1) - end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1; - - start_agno = xfs_daddr_to_agno(mp, start); - end_agno = xfs_daddr_to_agno(mp, end); + end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1; - for (agno = start_agno; agno <= end_agno; agno++) { - error = xfs_trim_extents(mp, agno, start, end, minlen, + agno = xfs_daddr_to_agno(mp, start); + for_each_perag_range(mp, agno, xfs_daddr_to_agno(mp, end), pag) { + error = xfs_trim_extents(pag, start, end, minlen, &blocks_trimmed); if (error) { last_error = error; - if (error == -ERESTARTSYS) + if (error == -ERESTARTSYS) { + xfs_perag_rele(pag); break; + } } } diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index ae082808cfed..b2cbbba3e15a 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -228,7 +228,7 @@ static struct attribute *xfs_errortag_attrs[] = { }; ATTRIBUTE_GROUPS(xfs_errortag); -static struct kobj_type xfs_errortag_ktype = { +static const struct kobj_type xfs_errortag_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_errortag_sysfs_ops, .default_groups = xfs_errortag_groups, diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index dbe6c37dc697..0b9c5ba8a598 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -75,7 +75,7 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); /* * XFS panic tags -- allow a call to xfs_alert_tag() to be turned into - * a panic by setting xfs_panic_mask in a sysctl. + * a panic by setting fs.xfs.panic_mask in a sysctl. 
*/ #define XFS_NO_PTAG 0u #define XFS_PTAG_IFLUSH (1u << 0) @@ -88,6 +88,16 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); #define XFS_PTAG_FSBLOCK_ZERO (1u << 7) #define XFS_PTAG_VERIFIER_ERROR (1u << 8) +#define XFS_PTAG_MASK (XFS_PTAG_IFLUSH | \ + XFS_PTAG_LOGRES | \ + XFS_PTAG_AILDELETE | \ + XFS_PTAG_ERROR_REPORT | \ + XFS_PTAG_SHUTDOWN_CORRUPT | \ + XFS_PTAG_SHUTDOWN_IOERROR | \ + XFS_PTAG_SHUTDOWN_LOGERROR | \ + XFS_PTAG_FSBLOCK_ZERO | \ + XFS_PTAG_VERIFIER_ERROR) + #define XFS_PTAG_STRINGS \ { XFS_NO_PTAG, "none" }, \ { XFS_PTAG_IFLUSH, "iflush" }, \ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index d5130d1fcfae..011b50469301 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -345,23 +345,30 @@ static int xfs_trans_free_extent( struct xfs_trans *tp, struct xfs_efd_log_item *efdp, - xfs_fsblock_t start_block, - xfs_extlen_t ext_len, - const struct xfs_owner_info *oinfo, - bool skip_discard) + struct xfs_extent_free_item *xefi) { + struct xfs_owner_info oinfo = { }; struct xfs_mount *mp = tp->t_mountp; struct xfs_extent *extp; uint next_extent; - xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, start_block); + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, + xefi->xefi_startblock); xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, - start_block); + xefi->xefi_startblock); int error; - trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len); + oinfo.oi_owner = xefi->xefi_owner; + if (xefi->xefi_flags & XFS_EFI_ATTR_FORK) + oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; + if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK) + oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; + + trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, + xefi->xefi_blockcount); - error = __xfs_free_extent(tp, start_block, ext_len, - oinfo, XFS_AG_RESV_NONE, skip_discard); + error = __xfs_free_extent(tp, xefi->xefi_startblock, + xefi->xefi_blockcount, &oinfo, XFS_AG_RESV_NONE, + xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); /* * Mark the transaction dirty, even on error. 
This ensures the * transaction is aborted, which: @@ -375,8 +382,8 @@ xfs_trans_free_extent( next_extent = efdp->efd_next_extent; ASSERT(next_extent < efdp->efd_format.efd_nextents); extp = &(efdp->efd_format.efd_extents[next_extent]); - extp->ext_start = start_block; - extp->ext_len = ext_len; + extp->ext_start = xefi->xefi_startblock; + extp->ext_len = xefi->xefi_blockcount; efdp->efd_next_extent++; return error; @@ -404,7 +411,7 @@ STATIC void xfs_extent_free_log_item( struct xfs_trans *tp, struct xfs_efi_log_item *efip, - struct xfs_extent_free_item *free) + struct xfs_extent_free_item *xefi) { uint next_extent; struct xfs_extent *extp; @@ -420,8 +427,8 @@ xfs_extent_free_log_item( next_extent = atomic_inc_return(&efip->efi_next_extent) - 1; ASSERT(next_extent < efip->efi_format.efi_nextents); extp = &efip->efi_format.efi_extents[next_extent]; - extp->ext_start = free->xefi_startblock; - extp->ext_len = free->xefi_blockcount; + extp->ext_start = xefi->xefi_startblock; + extp->ext_len = xefi->xefi_blockcount; } static struct xfs_log_item * @@ -433,15 +440,15 @@ xfs_extent_free_create_intent( { struct xfs_mount *mp = tp->t_mountp; struct xfs_efi_log_item *efip = xfs_efi_init(mp, count); - struct xfs_extent_free_item *free; + struct xfs_extent_free_item *xefi; ASSERT(count > 0); xfs_trans_add_item(tp, &efip->efi_item); if (sort) list_sort(mp, items, xfs_extent_free_diff_items); - list_for_each_entry(free, items, xefi_list) - xfs_extent_free_log_item(tp, efip, free); + list_for_each_entry(xefi, items, xefi_list) + xfs_extent_free_log_item(tp, efip, xefi); return &efip->efi_item; } @@ -463,21 +470,13 @@ xfs_extent_free_finish_item( struct list_head *item, struct xfs_btree_cur **state) { - struct xfs_owner_info oinfo = { }; - struct xfs_extent_free_item *free; + struct xfs_extent_free_item *xefi; int error; - free = container_of(item, struct xfs_extent_free_item, xefi_list); - oinfo.oi_owner = free->xefi_owner; - if (free->xefi_flags & XFS_EFI_ATTR_FORK) - oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; - if (free->xefi_flags & XFS_EFI_BMBT_BLOCK) - oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; - error = xfs_trans_free_extent(tp, EFD_ITEM(done), - free->xefi_startblock, - free->xefi_blockcount, - &oinfo, free->xefi_flags & XFS_EFI_SKIP_DISCARD); - kmem_cache_free(xfs_extfree_item_cache, free); + xefi = container_of(item, struct xfs_extent_free_item, xefi_list); + + error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi); + kmem_cache_free(xfs_extfree_item_cache, xefi); return error; } @@ -494,10 +493,10 @@ STATIC void xfs_extent_free_cancel_item( struct list_head *item) { - struct xfs_extent_free_item *free; + struct xfs_extent_free_item *xefi; - free = container_of(item, struct xfs_extent_free_item, xefi_list); - kmem_cache_free(xfs_extfree_item_cache, free); + xefi = container_of(item, struct xfs_extent_free_item, xefi_list); + kmem_cache_free(xfs_extfree_item_cache, xefi); } const struct xfs_defer_op_type xfs_extent_free_defer_type = { @@ -523,7 +522,7 @@ xfs_agfl_free_finish_item( struct xfs_owner_info oinfo = { }; struct xfs_mount *mp = tp->t_mountp; struct xfs_efd_log_item *efdp = EFD_ITEM(done); - struct xfs_extent_free_item *free; + struct xfs_extent_free_item *xefi; struct xfs_extent *extp; struct xfs_buf *agbp; int error; @@ -532,13 +531,13 @@ xfs_agfl_free_finish_item( uint next_extent; struct xfs_perag *pag; - free = container_of(item, struct xfs_extent_free_item, xefi_list); - ASSERT(free->xefi_blockcount == 1); - agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock); - agbno = 
XFS_FSB_TO_AGBNO(mp, free->xefi_startblock); - oinfo.oi_owner = free->xefi_owner; + xefi = container_of(item, struct xfs_extent_free_item, xefi_list); + ASSERT(xefi->xefi_blockcount == 1); + agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock); + agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock); + oinfo.oi_owner = xefi->xefi_owner; - trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount); + trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, xefi->xefi_blockcount); pag = xfs_perag_get(mp, agno); error = xfs_alloc_read_agf(pag, tp, 0, &agbp); @@ -559,11 +558,11 @@ xfs_agfl_free_finish_item( next_extent = efdp->efd_next_extent; ASSERT(next_extent < efdp->efd_format.efd_nextents); extp = &(efdp->efd_format.efd_extents[next_extent]); - extp->ext_start = free->xefi_startblock; - extp->ext_len = free->xefi_blockcount; + extp->ext_start = xefi->xefi_startblock; + extp->ext_len = xefi->xefi_blockcount; efdp->efd_next_extent++; - kmem_cache_free(xfs_extfree_item_cache, free); + kmem_cache_free(xfs_extfree_item_cache, xefi); return error; } @@ -599,7 +598,6 @@ xfs_efi_item_recover( struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_efd_log_item *efdp; struct xfs_trans *tp; - struct xfs_extent *extp; int i; int error = 0; @@ -624,10 +622,17 @@ xfs_efi_item_recover( efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); for (i = 0; i < efip->efi_format.efi_nextents; i++) { + struct xfs_extent_free_item fake = { + .xefi_owner = XFS_RMAP_OWN_UNKNOWN, + }; + struct xfs_extent *extp; + extp = &efip->efi_format.efi_extents[i]; - error = xfs_trans_free_extent(tp, efdp, extp->ext_start, - extp->ext_len, - &XFS_RMAP_OINFO_ANY_OWNER, false); + + fake.xefi_startblock = extp->ext_start; + fake.xefi_blockcount = extp->ext_len; + + error = xfs_trans_free_extent(tp, efdp, &fake); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, extp, sizeof(*extp)); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 595a5bcf46b9..705250f9f90a 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1047,7 +1047,7 @@ xfs_file_fallocate( iattr.ia_valid = ATTR_SIZE; iattr.ia_size = new_size; - error = xfs_vn_setattr_size(file_mnt_user_ns(file), + error = xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file), &iattr); if (error) goto out_unlock; @@ -1429,7 +1429,7 @@ xfs_file_mmap( file_accessed(file); vma->vm_ops = &xfs_file_vm_ops; if (IS_DAX(inode)) - vma->vm_flags |= VM_HUGEPAGE; + vm_flags_set(vma, VM_HUGEPAGE); return 0; } diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 34b21a29c39b..22c13933c8f8 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -12,6 +12,7 @@ #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_bmap.h" +#include "xfs_bmap_util.h" #include "xfs_alloc.h" #include "xfs_mru_cache.h" #include "xfs_trace.h" @@ -22,7 +23,7 @@ struct xfs_fstrm_item { struct xfs_mru_cache_elem mru; - xfs_agnumber_t ag; /* AG in use for this directory */ + struct xfs_perag *pag; /* AG in use for this directory */ }; enum xfs_fstrm_alloc { @@ -30,117 +31,68 @@ enum xfs_fstrm_alloc { XFS_PICK_LOWSPACE = 2, }; -/* - * Allocation group filestream associations are tracked with per-ag atomic - * counters. These counters allow xfs_filestream_pick_ag() to tell whether a - * particular AG already has active filestreams associated with it. 
- */ -int -xfs_filestream_peek_ag( - xfs_mount_t *mp, - xfs_agnumber_t agno) -{ - struct xfs_perag *pag; - int ret; - - pag = xfs_perag_get(mp, agno); - ret = atomic_read(&pag->pagf_fstrms); - xfs_perag_put(pag); - return ret; -} - -static int -xfs_filestream_get_ag( - xfs_mount_t *mp, - xfs_agnumber_t agno) -{ - struct xfs_perag *pag; - int ret; - - pag = xfs_perag_get(mp, agno); - ret = atomic_inc_return(&pag->pagf_fstrms); - xfs_perag_put(pag); - return ret; -} - -static void -xfs_filestream_put_ag( - xfs_mount_t *mp, - xfs_agnumber_t agno) -{ - struct xfs_perag *pag; - - pag = xfs_perag_get(mp, agno); - atomic_dec(&pag->pagf_fstrms); - xfs_perag_put(pag); -} - static void xfs_fstrm_free_func( void *data, struct xfs_mru_cache_elem *mru) { - struct xfs_mount *mp = data; struct xfs_fstrm_item *item = container_of(mru, struct xfs_fstrm_item, mru); + struct xfs_perag *pag = item->pag; - xfs_filestream_put_ag(mp, item->ag); - trace_xfs_filestream_free(mp, mru->key, item->ag); + trace_xfs_filestream_free(pag, mru->key); + atomic_dec(&pag->pagf_fstrms); + xfs_perag_rele(pag); kmem_free(item); } /* - * Scan the AGs starting at startag looking for an AG that isn't in use and has - * at least minlen blocks free. + * Scan the AGs starting at start_agno looking for an AG that isn't in use and + * has at least minlen blocks free. If no AG is found to match the allocation + * requirements, pick the AG with the most free space in it. */ static int xfs_filestream_pick_ag( - struct xfs_inode *ip, - xfs_agnumber_t startag, - xfs_agnumber_t *agp, + struct xfs_alloc_arg *args, + xfs_ino_t pino, + xfs_agnumber_t start_agno, int flags, - xfs_extlen_t minlen) + xfs_extlen_t *longest) { - struct xfs_mount *mp = ip->i_mount; - struct xfs_fstrm_item *item; + struct xfs_mount *mp = args->mp; struct xfs_perag *pag; - xfs_extlen_t longest, free = 0, minfree, maxfree = 0; - xfs_agnumber_t ag, max_ag = NULLAGNUMBER; - int err, trylock, nscan; - - ASSERT(S_ISDIR(VFS_I(ip)->i_mode)); + struct xfs_perag *max_pag = NULL; + xfs_extlen_t minlen = *longest; + xfs_extlen_t free = 0, minfree, maxfree = 0; + xfs_agnumber_t agno; + bool first_pass = true; + int err; /* 2% of an AG's blocks must be free for it to be chosen. */ minfree = mp->m_sb.sb_agblocks / 50; - ag = startag; - *agp = NULLAGNUMBER; - - /* For the first pass, don't sleep trying to init the per-AG. */ - trylock = XFS_ALLOC_FLAG_TRYLOCK; - - for (nscan = 0; 1; nscan++) { - trace_xfs_filestream_scan(mp, ip->i_ino, ag); - - pag = xfs_perag_get(mp, ag); - - if (!pag->pagf_init) { - err = xfs_alloc_read_agf(pag, NULL, trylock, NULL); - if (err) { - if (err != -EAGAIN) { - xfs_perag_put(pag); - return err; - } - /* Couldn't lock the AGF, skip this AG. */ - goto next_ag; - } +restart: + for_each_perag_wrap(mp, start_agno, agno, pag) { + trace_xfs_filestream_scan(pag, pino); + *longest = 0; + err = xfs_bmap_longest_free_extent(pag, NULL, longest); + if (err) { + xfs_perag_rele(pag); + if (err != -EAGAIN) + break; + /* Couldn't lock the AGF, skip this AG. */ + err = 0; + continue; } /* Keep track of the AG with the most free blocks. */ if (pag->pagf_freeblks > maxfree) { maxfree = pag->pagf_freeblks; - max_ag = ag; + if (max_pag) + xfs_perag_rele(max_pag); + atomic_inc(&pag->pag_active_ref); + max_pag = pag; } /* @@ -149,93 +101,73 @@ xfs_filestream_pick_ag( * loop, and it guards against two filestreams being established * in the same AG as each other. 
- */ - if (xfs_filestream_get_ag(mp, ag) > 1) { - xfs_filestream_put_ag(mp, ag); - goto next_ag; - } - - longest = xfs_alloc_longest_free_extent(pag, - xfs_alloc_min_freelist(mp, pag), - xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE)); - if (((minlen && longest >= minlen) || - (!minlen && pag->pagf_freeblks >= minfree)) && - (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) || - (flags & XFS_PICK_LOWSPACE))) { - - /* Break out, retaining the reference on the AG. */ - free = pag->pagf_freeblks; - xfs_perag_put(pag); - *agp = ag; - break; + if (atomic_inc_return(&pag->pagf_fstrms) <= 1) { + if (((minlen && *longest >= minlen) || + (!minlen && pag->pagf_freeblks >= minfree)) && + (!xfs_perag_prefers_metadata(pag) || + !(flags & XFS_PICK_USERDATA) || + (flags & XFS_PICK_LOWSPACE))) { + /* Break out, retaining the reference on the AG. */ + free = pag->pagf_freeblks; + break; + } } /* Drop the reference on this AG, it's not usable. */ - xfs_filestream_put_ag(mp, ag); -next_ag: - xfs_perag_put(pag); - /* Move to the next AG, wrapping to AG 0 if necessary. */ - if (++ag >= mp->m_sb.sb_agcount) - ag = 0; - - /* If a full pass of the AGs hasn't been done yet, continue. */ - if (ag != startag) - continue; + atomic_dec(&pag->pagf_fstrms); + } - /* Allow sleeping in xfs_alloc_read_agf() on the 2nd pass. */ - if (trylock != 0) { - trylock = 0; - continue; + if (err) { + xfs_perag_rele(pag); + if (max_pag) + xfs_perag_rele(max_pag); + return err; + } + + if (!pag) { + /* + * Allow a second pass to give xfs_bmap_longest_free_extent() + * another attempt at locking AGFs that it might have skipped + * over before we fail. + */ + if (first_pass) { + first_pass = false; + goto restart; } - /* Finally, if lowspace wasn't set, set it for the 3rd pass. */ + /* + * We must be low on data space, so run a final lowspace + * optimised selection pass if we haven't already. + */ if (!(flags & XFS_PICK_LOWSPACE)) { flags |= XFS_PICK_LOWSPACE; - continue; + goto restart; } /* - * Take the AG with the most free space, regardless of whether - * it's already in use by another filestream. + * No unassociated AGs are available, so select the AG with the + * most free space, regardless of whether it's already in use by + * another filestream. If none suit, just use whatever AG we can + * grab. */ - if (max_ag != NULLAGNUMBER) { - xfs_filestream_get_ag(mp, max_ag); + if (!max_pag) { + for_each_perag_wrap(args->mp, 0, start_agno, args->pag) + break; + atomic_inc(&args->pag->pagf_fstrms); + *longest = 0; + } else { + pag = max_pag; free = maxfree; - *agp = max_ag; - break; + atomic_inc(&pag->pagf_fstrms); } - - /* take AG 0 if none matched */ - trace_xfs_filestream_pick(ip, *agp, free, nscan); - *agp = 0; - return 0; - } - - trace_xfs_filestream_pick(ip, *agp, free, nscan); - - if (*agp == NULLAGNUMBER) - return 0; - - err = -ENOMEM; - item = kmem_alloc(sizeof(*item), KM_MAYFAIL); - if (!item) - goto out_put_ag; - - item->ag = *agp; - - err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru); - if (err) { - if (err == -EEXIST) - err = 0; - goto out_free_item; + } else if (max_pag) { + xfs_perag_rele(max_pag); } + trace_xfs_filestream_pick(pag, pino, free); + args->pag = pag; return 0; -out_free_item: - kmem_free(item); -out_put_ag: - xfs_filestream_put_ag(mp, *agp); - return err; } static struct xfs_inode * @@ -263,104 +195,187 @@ out: } /* - * Find the right allocation group for a file, either by finding an - * existing file stream or creating a new one. + * Look up the mru cache for an existing association. 
If one exists and we can + * use it, return with an active perag reference indicating that the allocation + * will proceed with that association. * - * Returns NULLAGNUMBER in case of an error. + * If we have no association, or we cannot use the current one and have to + * destroy it, return with longest = 0 to tell the caller to create a new + * association. */ -xfs_agnumber_t -xfs_filestream_lookup_ag( - struct xfs_inode *ip) +static int +xfs_filestream_lookup_association( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_ino_t pino, + xfs_extlen_t *longest) { - struct xfs_mount *mp = ip->i_mount; - struct xfs_inode *pip = NULL; - xfs_agnumber_t startag, ag = NULLAGNUMBER; + struct xfs_mount *mp = args->mp; + struct xfs_perag *pag; struct xfs_mru_cache_elem *mru; + int error = 0; - ASSERT(S_ISREG(VFS_I(ip)->i_mode)); - - pip = xfs_filestream_get_parent(ip); - if (!pip) - return NULLAGNUMBER; + *longest = 0; + mru = xfs_mru_cache_lookup(mp->m_filestream, pino); + if (!mru) + return 0; + /* + * Grab the pag and take an extra active reference for the caller whilst + * the mru item cannot go away. This means we'll pin the perag with + * the reference we get here even if the filestreams association is torn + * down immediately after we mark the lookup as done. + */ + pag = container_of(mru, struct xfs_fstrm_item, mru)->pag; + atomic_inc(&pag->pag_active_ref); + xfs_mru_cache_done(mp->m_filestream); - mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino); - if (mru) { - ag = container_of(mru, struct xfs_fstrm_item, mru)->ag; - xfs_mru_cache_done(mp->m_filestream); + trace_xfs_filestream_lookup(pag, ap->ip->i_ino); - trace_xfs_filestream_lookup(mp, ip->i_ino, ag); - goto out; - } + ap->blkno = XFS_AGB_TO_FSB(args->mp, pag->pag_agno, 0); + xfs_bmap_adjacent(ap); /* - * Set the starting AG using the rotor for inode32, otherwise - * use the directory inode's AG. + * If there is very little free space before we start a filestreams + * allocation, we're almost guaranteed to fail to find a large enough + * free space available so just use the cached AG. */ - if (xfs_is_inode32(mp)) { - xfs_agnumber_t rotorstep = xfs_rotorstep; - startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount; - mp->m_agfrotor = (mp->m_agfrotor + 1) % - (mp->m_sb.sb_agcount * rotorstep); - } else - startag = XFS_INO_TO_AGNO(mp, pip->i_ino); + if (ap->tp->t_flags & XFS_TRANS_LOWMODE) { + *longest = 1; + goto out_done; + } - if (xfs_filestream_pick_ag(pip, startag, &ag, 0, 0)) - ag = NULLAGNUMBER; -out: - xfs_irele(pip); - return ag; + error = xfs_bmap_longest_free_extent(pag, args->tp, longest); + if (error == -EAGAIN) + error = 0; + if (error || *longest < args->maxlen) { + /* We aren't going to use this perag */ + *longest = 0; + xfs_perag_rele(pag); + return error; + } + +out_done: + args->pag = pag; + return 0; } -/* - * Pick a new allocation group for the current file and its file stream. - * - * This is called when the allocator can't find a suitable extent in the - * current AG, and we have to move the stream into a new AG with more space. 
- */ -int -xfs_filestream_new_ag( +static int +xfs_filestream_create_association( struct xfs_bmalloca *ap, - xfs_agnumber_t *agp) + struct xfs_alloc_arg *args, + xfs_ino_t pino, + xfs_extlen_t *longest) { - struct xfs_inode *ip = ap->ip, *pip; - struct xfs_mount *mp = ip->i_mount; - xfs_extlen_t minlen = ap->length; - xfs_agnumber_t startag = 0; - int flags = 0; - int err = 0; + struct xfs_mount *mp = args->mp; struct xfs_mru_cache_elem *mru; + struct xfs_fstrm_item *item; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, pino); + int flags = 0; + int error; - *agp = NULLAGNUMBER; - - pip = xfs_filestream_get_parent(ip); - if (!pip) - goto exit; - - mru = xfs_mru_cache_remove(mp->m_filestream, pip->i_ino); + /* Changing parent AG association now, so remove the existing one. */ + mru = xfs_mru_cache_remove(mp->m_filestream, pino); if (mru) { struct xfs_fstrm_item *item = container_of(mru, struct xfs_fstrm_item, mru); - startag = (item->ag + 1) % mp->m_sb.sb_agcount; + + agno = (item->pag->pag_agno + 1) % mp->m_sb.sb_agcount; + xfs_fstrm_free_func(mp, mru); + } else if (xfs_is_inode32(mp)) { + xfs_agnumber_t rotorstep = xfs_rotorstep; + + agno = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount; + mp->m_agfrotor = (mp->m_agfrotor + 1) % + (mp->m_sb.sb_agcount * rotorstep); } + ap->blkno = XFS_AGB_TO_FSB(args->mp, agno, 0); + xfs_bmap_adjacent(ap); + if (ap->datatype & XFS_ALLOC_USERDATA) flags |= XFS_PICK_USERDATA; if (ap->tp->t_flags & XFS_TRANS_LOWMODE) flags |= XFS_PICK_LOWSPACE; - err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen); + *longest = ap->length; + error = xfs_filestream_pick_ag(args, pino, agno, flags, longest); + if (error) + return error; /* - * Only free the item here so we skip over the old AG earlier. + * We are going to use this perag now, so create an association for it. + * xfs_filestream_pick_ag() has already bumped the perag fstrms counter + * for us, so all we need to do here is take another active reference to + * the perag for the cached association. + * + * If we fail to store the association, we need to drop the fstrms + * counter as well as drop the perag reference we take here for the + * item. We do not need to return an error for this failure - as long as + * we return a referenced AG, the allocation can still go ahead just + * fine. */ - if (mru) - xfs_fstrm_free_func(mp, mru); + item = kmem_alloc(sizeof(*item), KM_MAYFAIL); + if (!item) + goto out_put_fstrms; + + atomic_inc(&args->pag->pag_active_ref); + item->pag = args->pag; + error = xfs_mru_cache_insert(mp->m_filestream, pino, &item->mru); + if (error) + goto out_free_item; + return 0; + +out_free_item: + xfs_perag_rele(item->pag); + kmem_free(item); +out_put_fstrms: + atomic_dec(&args->pag->pagf_fstrms); + return 0; +} + +/* + * Search for an allocation group with a single extent large enough for + * the request. First we look for an existing association and use that if it + * is found. Otherwise, we create a new association by selecting an AG that fits + * the allocation criteria. + * + * We return with a referenced perag in args->pag to indicate which AG we are + * allocating into or an error with no references held. 
+ */ +int +xfs_filestream_select_ag( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_extlen_t *longest) +{ + struct xfs_mount *mp = args->mp; + struct xfs_inode *pip; + xfs_ino_t ino = 0; + int error = 0; + + *longest = 0; + args->total = ap->total; + pip = xfs_filestream_get_parent(ap->ip); + if (pip) { + ino = pip->i_ino; + error = xfs_filestream_lookup_association(ap, args, ino, + longest); + xfs_irele(pip); + if (error) + return error; + if (*longest >= args->maxlen) + goto out_select; + if (ap->tp->t_flags & XFS_TRANS_LOWMODE) + goto out_select; + } + + error = xfs_filestream_create_association(ap, args, ino, longest); + if (error) + return error; - xfs_irele(pip); -exit: - if (*agp == NULLAGNUMBER) - *agp = 0; - return err; +out_select: + ap->blkno = XFS_AGB_TO_FSB(mp, args->pag->pag_agno, 0); + return 0; } void diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h index 403226ebb80b..84149ed0e340 100644 --- a/fs/xfs/xfs_filestream.h +++ b/fs/xfs/xfs_filestream.h @@ -9,13 +9,13 @@ struct xfs_mount; struct xfs_inode; struct xfs_bmalloca; +struct xfs_alloc_arg; int xfs_filestream_mount(struct xfs_mount *mp); void xfs_filestream_unmount(struct xfs_mount *mp); void xfs_filestream_deassociate(struct xfs_inode *ip); -xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip); -int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp); -int xfs_filestream_peek_ag(struct xfs_mount *mp, xfs_agnumber_t agno); +int xfs_filestream_select_ag(struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, xfs_extlen_t *blen); static inline int xfs_inode_is_filestream( diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 88a88506ffff..59e7d1a14b67 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -688,11 +688,11 @@ __xfs_getfsmap_datadev( info->agf_bp = NULL; } if (info->pag) { - xfs_perag_put(info->pag); + xfs_perag_rele(info->pag); info->pag = NULL; } else if (pag) { /* loop termination case */ - xfs_perag_put(pag); + xfs_perag_rele(pag); } return error; @@ -761,6 +761,7 @@ xfs_getfsmap_datadev_bnobt( { struct xfs_alloc_rec_incore akeys[2]; + memset(akeys, 0, sizeof(akeys)); info->missing_owner = XFS_FMR_OWN_UNKNOWN; return __xfs_getfsmap_datadev(tp, keys, info, xfs_getfsmap_datadev_bnobt_query, &akeys[0]); diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 4d0a98f920ca..9edc1f2bc939 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -4,6 +4,7 @@ * All Rights Reserved. */ #include "xfs.h" +#include "xfs_error.h" /* * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n, @@ -15,7 +16,7 @@ xfs_param_t xfs_params = { /* MIN DFLT MAX */ .sgid_inherit = { 0, 0, 1 }, .symlink_mode = { 0, 0, 1 }, - .panic_mask = { 0, 0, 256 }, + .panic_mask = { 0, 0, XFS_PTAG_MASK}, .error_level = { 0, 3, 11 }, .syncd_timer = { 1*100, 30*100, 7200*100}, .stats_clear = { 0, 0, 1 }, diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index ddeaccc04aec..c9a7e270a428 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -255,7 +255,7 @@ xfs_perag_set_inode_tag( break; } - trace_xfs_perag_set_inode_tag(mp, pag->pag_agno, tag, _RET_IP_); + trace_xfs_perag_set_inode_tag(pag, _RET_IP_); } /* Clear a tag on both the AG incore inode tree and the AG radix tree. 
*/ @@ -289,7 +289,7 @@ xfs_perag_clear_inode_tag( radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, tag); spin_unlock(&mp->m_perag_lock); - trace_xfs_perag_clear_inode_tag(mp, pag->pag_agno, tag, _RET_IP_); + trace_xfs_perag_clear_inode_tag(pag, _RET_IP_); } /* @@ -586,7 +586,7 @@ xfs_iget_cache_miss( if (!ip) return -ENOMEM; - error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, flags); + error = xfs_imap(pag, tp, ip->i_ino, &ip->i_imap, flags); if (error) goto out_destroy; @@ -1767,7 +1767,7 @@ xfs_icwalk( if (error) { last_error = error; if (error == -EFSCORRUPTED) { - xfs_perag_put(pag); + xfs_perag_rele(pag); break; } } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d354ea2b74f9..5808abab786c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -777,7 +777,7 @@ xfs_inode_inherit_flags2( */ int xfs_init_new_inode( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct xfs_trans *tp, struct xfs_inode *pip, xfs_ino_t ino, @@ -823,11 +823,11 @@ xfs_init_new_inode( ip->i_projid = prid; if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) { - inode_fsuid_set(inode, mnt_userns); + inode_fsuid_set(inode, idmap); inode->i_gid = dir->i_gid; inode->i_mode = mode; } else { - inode_init_owner(mnt_userns, inode, dir, mode); + inode_init_owner(idmap, inode, dir, mode); } /* @@ -836,7 +836,7 @@ xfs_init_new_inode( * (and only if the irix_sgid_inherit compatibility variable is set). */ if (irix_sgid_inherit && (inode->i_mode & S_ISGID) && - !vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode))) + !vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode))) inode->i_mode &= ~S_ISGID; ip->i_disk_size = 0; @@ -946,7 +946,7 @@ xfs_bumplink( int xfs_create( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, xfs_inode_t *dp, struct xfs_name *name, umode_t mode, @@ -978,8 +978,8 @@ xfs_create( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns), - mapped_fsgid(mnt_userns, &init_user_ns), prid, + error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns), + mapped_fsgid(idmap, &init_user_ns), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -1020,7 +1020,7 @@ xfs_create( */ error = xfs_dialloc(&tp, dp->i_ino, mode, &ino); if (!error) - error = xfs_init_new_inode(mnt_userns, tp, dp, ino, mode, + error = xfs_init_new_inode(idmap, tp, dp, ino, mode, is_dir ? 2 : 1, rdev, prid, init_xattrs, &ip); if (error) goto out_trans_cancel; @@ -1102,7 +1102,7 @@ xfs_create( int xfs_create_tmpfile( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct xfs_inode *dp, umode_t mode, struct xfs_inode **ipp) @@ -1127,8 +1127,8 @@ xfs_create_tmpfile( /* * Make sure that we have allocated dquot(s) on disk. 
*/ - error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns), - mapped_fsgid(mnt_userns, &init_user_ns), prid, + error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns), + mapped_fsgid(idmap, &init_user_ns), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -1144,7 +1144,7 @@ xfs_create_tmpfile( error = xfs_dialloc(&tp, dp->i_ino, mode, &ino); if (!error) - error = xfs_init_new_inode(mnt_userns, tp, dp, ino, mode, + error = xfs_init_new_inode(idmap, tp, dp, ino, mode, 0, 0, prid, false, &ip); if (error) goto out_trans_cancel; @@ -1367,7 +1367,7 @@ xfs_itruncate_extents_flags( unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1; while (unmap_len > 0) { - ASSERT(tp->t_firstblock == NULLFSBLOCK); + ASSERT(tp->t_highest_agno == NULLAGNUMBER); error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len, flags, XFS_ITRUNC_MAX_EXTENTS); if (error) @@ -2709,7 +2709,7 @@ out_trans_abort: */ static int xfs_rename_alloc_whiteout( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct xfs_name *src_name, struct xfs_inode *dp, struct xfs_inode **wip) @@ -2718,7 +2718,7 @@ xfs_rename_alloc_whiteout( struct qstr name; int error; - error = xfs_create_tmpfile(mnt_userns, dp, S_IFCHR | WHITEOUT_MODE, + error = xfs_create_tmpfile(idmap, dp, S_IFCHR | WHITEOUT_MODE, &tmpfile); if (error) return error; @@ -2750,7 +2750,7 @@ xfs_rename_alloc_whiteout( */ int xfs_rename( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct xfs_inode *src_dp, struct xfs_name *src_name, struct xfs_inode *src_ip, @@ -2782,7 +2782,7 @@ xfs_rename( * appropriately. */ if (flags & RENAME_WHITEOUT) { - error = xfs_rename_alloc_whiteout(mnt_userns, src_name, + error = xfs_rename_alloc_whiteout(idmap, src_name, target_dp, &wip); if (error) return error; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index fa780f08dc89..69d21e42c10a 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -473,18 +473,18 @@ int xfs_release(struct xfs_inode *ip); void xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, const struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); -int xfs_create(struct user_namespace *mnt_userns, +int xfs_create(struct mnt_idmap *idmap, struct xfs_inode *dp, struct xfs_name *name, umode_t mode, dev_t rdev, bool need_xattr, struct xfs_inode **ipp); -int xfs_create_tmpfile(struct user_namespace *mnt_userns, +int xfs_create_tmpfile(struct mnt_idmap *idmap, struct xfs_inode *dp, umode_t mode, struct xfs_inode **ipp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, struct xfs_name *target_name); -int xfs_rename(struct user_namespace *mnt_userns, +int xfs_rename(struct mnt_idmap *idmap, struct xfs_inode *src_dp, struct xfs_name *src_name, struct xfs_inode *src_ip, struct xfs_inode *target_dp, struct xfs_name *target_name, @@ -515,7 +515,7 @@ void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode, xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); -int xfs_init_new_inode(struct user_namespace *mnt_userns, struct xfs_trans *tp, +int xfs_init_new_inode(struct mnt_idmap *idmap, struct xfs_trans *tp, struct xfs_inode *pip, xfs_ino_t ino, umode_t mode, xfs_nlink_t nlink, dev_t rdev, prid_t prid, bool init_xattrs, struct xfs_inode **ipp); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 736510bc241b..55bb01173cde 100644 --- 
a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -665,7 +665,7 @@ xfs_ioc_fsbulkstat( struct xfs_fsop_bulkreq bulkreq; struct xfs_ibulk breq = { .mp = mp, - .mnt_userns = file_mnt_user_ns(file), + .idmap = file_mnt_idmap(file), .ocount = 0, }; xfs_ino_t lastino; @@ -844,7 +844,7 @@ xfs_ioc_bulkstat( struct xfs_bulk_ireq hdr; struct xfs_ibulk breq = { .mp = mp, - .mnt_userns = file_mnt_user_ns(file), + .idmap = file_mnt_idmap(file), }; int error; @@ -1297,7 +1297,7 @@ xfs_ioctl_setattr_check_projid( int xfs_fileattr_set( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { @@ -1371,7 +1371,7 @@ xfs_fileattr_set( */ if ((VFS_I(ip)->i_mode & (S_ISUID|S_ISGID)) && - !capable_wrt_inode_uidgid(mnt_userns, VFS_I(ip), CAP_FSETID)) + !capable_wrt_inode_uidgid(idmap, VFS_I(ip), CAP_FSETID)) VFS_I(ip)->i_mode &= ~(S_ISUID|S_ISGID); /* Change the ownerships and register project quota modifications */ diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index d4abba2c13c1..38be600b5e1e 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -49,7 +49,7 @@ xfs_fileattr_get( extern int xfs_fileattr_set( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 2f54b701eead..ee35eea1ecce 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -204,7 +204,7 @@ xfs_compat_ioc_fsbulkstat( struct xfs_fsop_bulkreq bulkreq; struct xfs_ibulk breq = { .mp = mp, - .mnt_userns = file_mnt_user_ns(file), + .idmap = file_mnt_idmap(file), .ocount = 0, }; xfs_ino_t lastino; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index fc1946f80a4a..69dbe7814128 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -83,7 +83,7 @@ xfs_iomap_valid( return true; } -static const struct iomap_page_ops xfs_iomap_page_ops = { +static const struct iomap_folio_ops xfs_iomap_folio_ops = { .iomap_valid = xfs_iomap_valid, }; @@ -133,7 +133,7 @@ xfs_bmbt_to_iomap( iomap->flags |= IOMAP_F_DIRTY; iomap->validity_cookie = sequence_cookie; - iomap->page_ops = &xfs_iomap_page_ops; + iomap->folio_ops = &xfs_iomap_folio_ops; return 0; } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 515318dfbc38..24718adb3c16 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -162,12 +162,12 @@ xfs_create_need_xattr( STATIC int xfs_generic_create( - struct user_namespace *mnt_userns, - struct inode *dir, - struct dentry *dentry, - umode_t mode, - dev_t rdev, - struct file *tmpfile) /* unnamed file */ + struct mnt_idmap *idmap, + struct inode *dir, + struct dentry *dentry, + umode_t mode, + dev_t rdev, + struct file *tmpfile) /* unnamed file */ { struct inode *inode; struct xfs_inode *ip = NULL; @@ -196,11 +196,11 @@ xfs_generic_create( goto out_free_acl; if (!tmpfile) { - error = xfs_create(mnt_userns, XFS_I(dir), &name, mode, rdev, + error = xfs_create(idmap, XFS_I(dir), &name, mode, rdev, xfs_create_need_xattr(dir, default_acl, acl), &ip); } else { - error = xfs_create_tmpfile(mnt_userns, XFS_I(dir), mode, &ip); + error = xfs_create_tmpfile(idmap, XFS_I(dir), mode, &ip); } if (unlikely(error)) goto out_free_acl; @@ -255,35 +255,34 @@ xfs_generic_create( STATIC int xfs_vn_mknod( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { - return xfs_generic_create(mnt_userns, dir, dentry, mode, rdev, NULL); + return xfs_generic_create(idmap, dir, dentry, mode, rdev, NULL); } STATIC 
int xfs_vn_create( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool flags) { - return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, NULL); + return xfs_generic_create(idmap, dir, dentry, mode, 0, NULL); } STATIC int xfs_vn_mkdir( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { - return xfs_generic_create(mnt_userns, dir, dentry, mode | S_IFDIR, 0, - NULL); + return xfs_generic_create(idmap, dir, dentry, mode | S_IFDIR, 0, NULL); } STATIC struct dentry * @@ -400,7 +399,7 @@ xfs_vn_unlink( STATIC int xfs_vn_symlink( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) @@ -417,7 +416,7 @@ xfs_vn_symlink( if (unlikely(error)) goto out; - error = xfs_symlink(mnt_userns, XFS_I(dir), &name, symname, mode, &cip); + error = xfs_symlink(idmap, XFS_I(dir), &name, symname, mode, &cip); if (unlikely(error)) goto out; @@ -443,7 +442,7 @@ xfs_vn_symlink( STATIC int xfs_vn_rename( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct inode *odir, struct dentry *odentry, struct inode *ndir, @@ -472,7 +471,7 @@ xfs_vn_rename( if (unlikely(error)) return error; - return xfs_rename(mnt_userns, XFS_I(odir), &oname, + return xfs_rename(idmap, XFS_I(odir), &oname, XFS_I(d_inode(odentry)), XFS_I(ndir), &nname, new_inode ? XFS_I(new_inode) : NULL, flags); } @@ -549,7 +548,7 @@ xfs_stat_blksize( STATIC int xfs_vn_getattr( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, @@ -558,8 +557,8 @@ xfs_vn_getattr( struct inode *inode = d_inode(path->dentry); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); - vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); trace_xfs_getattr(ip); @@ -627,7 +626,7 @@ xfs_vn_getattr( static int xfs_vn_change_ok( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { @@ -639,7 +638,7 @@ xfs_vn_change_ok( if (xfs_is_shutdown(mp)) return -EIO; - return setattr_prepare(mnt_userns, dentry, iattr); + return setattr_prepare(idmap, dentry, iattr); } /* @@ -650,7 +649,7 @@ xfs_vn_change_ok( */ static int xfs_setattr_nonsize( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct xfs_inode *ip, struct iattr *iattr) @@ -679,14 +678,14 @@ xfs_setattr_nonsize( uint qflags = 0; if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) { - uid = from_vfsuid(mnt_userns, i_user_ns(inode), + uid = from_vfsuid(idmap, i_user_ns(inode), iattr->ia_vfsuid); qflags |= XFS_QMOPT_UQUOTA; } else { uid = inode->i_uid; } if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) { - gid = from_vfsgid(mnt_userns, i_user_ns(inode), + gid = from_vfsgid(idmap, i_user_ns(inode), iattr->ia_vfsgid); qflags |= XFS_QMOPT_GQUOTA; } else { @@ -719,18 +718,18 @@ xfs_setattr_nonsize( * also. 
*/ if (XFS_IS_UQUOTA_ON(mp) && - i_uid_needs_update(mnt_userns, iattr, inode)) { + i_uid_needs_update(idmap, iattr, inode)) { ASSERT(udqp); old_udqp = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp); } if (XFS_IS_GQUOTA_ON(mp) && - i_gid_needs_update(mnt_userns, iattr, inode)) { + i_gid_needs_update(idmap, iattr, inode)) { ASSERT(xfs_has_pquotino(mp) || !XFS_IS_PQUOTA_ON(mp)); ASSERT(gdqp); old_gdqp = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp); } - setattr_copy(mnt_userns, inode, iattr); + setattr_copy(idmap, inode, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); @@ -758,7 +757,7 @@ xfs_setattr_nonsize( * Posix ACL code seems to care about this issue either. */ if (mask & ATTR_MODE) { - error = posix_acl_chmod(mnt_userns, dentry, inode->i_mode); + error = posix_acl_chmod(idmap, dentry, inode->i_mode); if (error) return error; } @@ -779,7 +778,7 @@ out_dqrele: */ STATIC int xfs_setattr_size( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct xfs_inode *ip, struct iattr *iattr) @@ -812,7 +811,7 @@ xfs_setattr_size( * Use the regular setattr path to update the timestamps. */ iattr->ia_valid &= ~ATTR_SIZE; - return xfs_setattr_nonsize(mnt_userns, dentry, ip, iattr); + return xfs_setattr_nonsize(idmap, dentry, ip, iattr); } /* @@ -956,7 +955,7 @@ xfs_setattr_size( } ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID))); - setattr_copy(mnt_userns, inode, iattr); + setattr_copy(idmap, inode, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); @@ -977,7 +976,7 @@ out_trans_cancel: int xfs_vn_setattr_size( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { @@ -986,15 +985,15 @@ xfs_vn_setattr_size( trace_xfs_setattr(ip); - error = xfs_vn_change_ok(mnt_userns, dentry, iattr); + error = xfs_vn_change_ok(idmap, dentry, iattr); if (error) return error; - return xfs_setattr_size(mnt_userns, dentry, ip, iattr); + return xfs_setattr_size(idmap, dentry, ip, iattr); } STATIC int xfs_vn_setattr( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { @@ -1014,14 +1013,14 @@ xfs_vn_setattr( return error; } - error = xfs_vn_setattr_size(mnt_userns, dentry, iattr); + error = xfs_vn_setattr_size(idmap, dentry, iattr); xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); } else { trace_xfs_setattr(ip); - error = xfs_vn_change_ok(mnt_userns, dentry, iattr); + error = xfs_vn_change_ok(idmap, dentry, iattr); if (!error) - error = xfs_setattr_nonsize(mnt_userns, dentry, ip, iattr); + error = xfs_setattr_nonsize(idmap, dentry, ip, iattr); } return error; @@ -1092,12 +1091,12 @@ xfs_vn_fiemap( STATIC int xfs_vn_tmpfile( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { - int err = xfs_generic_create(mnt_userns, dir, file->f_path.dentry, mode, 0, file); + int err = xfs_generic_create(idmap, dir, file->f_path.dentry, mode, 0, file); return finish_open_simple(file, err); } diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index e570dcb5df8d..7f84a0843b24 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -13,7 +13,7 @@ extern const struct file_operations xfs_dir_file_operations; extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); -int xfs_vn_setattr_size(struct user_namespace *mnt_userns, +int xfs_vn_setattr_size(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *vap); int xfs_inode_init_security(struct inode 
*inode, struct inode *dir, diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index a1c2bcf65d37..f225413a993c 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -55,7 +55,7 @@ struct xfs_bstat_chunk { STATIC int xfs_bulkstat_one_int( struct xfs_mount *mp, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct xfs_trans *tp, xfs_ino_t ino, struct xfs_bstat_chunk *bc) @@ -83,8 +83,8 @@ xfs_bulkstat_one_int( ASSERT(ip != NULL); ASSERT(ip->i_imap.im_blkno != 0); inode = VFS_I(ip); - vfsuid = i_uid_into_vfsuid(mnt_userns, inode); - vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsuid = i_uid_into_vfsuid(idmap, inode); + vfsgid = i_gid_into_vfsgid(idmap, inode); /* xfs_iget returns the following without needing * further change. @@ -178,7 +178,7 @@ xfs_bulkstat_one( struct xfs_trans *tp; int error; - if (breq->mnt_userns != &init_user_ns) { + if (breq->idmap != &nop_mnt_idmap) { xfs_warn_ratelimited(breq->mp, "bulkstat not supported inside of idmapped mounts."); return -EINVAL; @@ -199,7 +199,7 @@ xfs_bulkstat_one( if (error) goto out; - error = xfs_bulkstat_one_int(breq->mp, breq->mnt_userns, tp, + error = xfs_bulkstat_one_int(breq->mp, breq->idmap, tp, breq->startino, &bc); xfs_trans_cancel(tp); out: @@ -225,7 +225,7 @@ xfs_bulkstat_iwalk( struct xfs_bstat_chunk *bc = data; int error; - error = xfs_bulkstat_one_int(mp, bc->breq->mnt_userns, tp, ino, data); + error = xfs_bulkstat_one_int(mp, bc->breq->idmap, tp, ino, data); /* bulkstat just skips over missing inodes */ if (error == -ENOENT || error == -EINVAL) return 0; @@ -270,7 +270,7 @@ xfs_bulkstat( unsigned int iwalk_flags = 0; int error; - if (breq->mnt_userns != &init_user_ns) { + if (breq->idmap != &nop_mnt_idmap) { xfs_warn_ratelimited(breq->mp, "bulkstat not supported inside of idmapped mounts."); return -EINVAL; diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index e2d0eba43f35..1659f13f17a8 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -8,7 +8,7 @@ /* In-memory representation of a userspace request for batch inode data. */ struct xfs_ibulk { struct xfs_mount *mp; - struct user_namespace *mnt_userns; + struct mnt_idmap *idmap; void __user *ubuffer; /* user output buffer */ xfs_ino_t startino; /* start with this inode */ unsigned int icount; /* number of elements in ubuffer */ diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 7558486f4937..21be93bf006d 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -275,7 +275,7 @@ xfs_iwalk_ag_start( /* Set up a fresh cursor and empty the inobt cache. */ iwag->nr_recs = 0; - error = xfs_inobt_cur(mp, tp, pag, XFS_BTNUM_INO, curpp, agi_bpp); + error = xfs_inobt_cur(pag, tp, XFS_BTNUM_INO, curpp, agi_bpp); if (error) return error; @@ -390,7 +390,7 @@ xfs_iwalk_run_callbacks( } /* ...and recreate the cursor just past where we left off. 
*/ - error = xfs_inobt_cur(mp, iwag->tp, iwag->pag, XFS_BTNUM_INO, curpp, + error = xfs_inobt_cur(iwag->pag, iwag->tp, XFS_BTNUM_INO, curpp, agi_bpp); if (error) return error; @@ -591,7 +591,7 @@ xfs_iwalk( } if (iwag.pag) - xfs_perag_put(pag); + xfs_perag_rele(pag); xfs_iwalk_free(&iwag); return error; } @@ -683,7 +683,7 @@ xfs_iwalk_threaded( break; } if (pag) - xfs_perag_put(pag); + xfs_perag_rele(pag); if (polled) xfs_pwork_poll(&pctl); return xfs_pwork_destroy(&pctl); @@ -776,7 +776,7 @@ xfs_inobt_walk( } if (iwag.pag) - xfs_perag_put(pag); + xfs_perag_rele(pag); xfs_iwalk_free(&iwag); return error; } diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index f9878021e7d0..e88f18f85e4b 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -34,6 +34,7 @@ typedef __u32 xfs_nlink_t; #include <linux/module.h> #include <linux/mutex.h> #include <linux/file.h> +#include <linux/filelock.h> #include <linux/swap.h> #include <linux/errno.h> #include <linux/sched/signal.h> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 8aca2cc173ac..f3269c0626f0 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -210,8 +210,7 @@ typedef struct xfs_mount { struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX]; struct xstats m_stats; /* per-fs stats */ xfs_agnumber_t m_agfrotor; /* last ag where space found */ - xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */ - spinlock_t m_agirotor_lock;/* .. and lock protecting it */ + atomic_t m_agirotor; /* last ag dir inode alloced */ /* Memory shrinker to throttle and reprioritize inodegc */ struct shrinker m_inodegc_shrinker; diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 38d23f0e703a..23d16186e1a3 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -322,7 +322,7 @@ xfs_fs_commit_blocks( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID))); - setattr_copy(&init_user_ns, inode, iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); if (update_isize) { i_size_write(inode, iattr->ia_size); ip->i_disk_size = iattr->ia_size; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index e2c542f6dcd4..7dc0db7f5a76 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -787,7 +787,7 @@ xfs_qm_qino_alloc( error = xfs_dialloc(&tp, 0, S_IFREG, &ino); if (!error) - error = xfs_init_new_inode(&init_user_ns, tp, NULL, ino, + error = xfs_init_new_inode(&nop_mnt_idmap, tp, NULL, ino, S_IFREG, 1, 0, 0, false, ipp); if (error) { xfs_trans_cancel(tp); diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 858e3e9eb4a8..48d771a76add 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -252,17 +252,12 @@ static int xfs_trans_log_finish_refcount_update( struct xfs_trans *tp, struct xfs_cud_log_item *cudp, - enum xfs_refcount_intent_type type, - xfs_fsblock_t startblock, - xfs_extlen_t blockcount, - xfs_fsblock_t *new_fsb, - xfs_extlen_t *new_len, + struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur) { int error; - error = xfs_refcount_finish_one(tp, type, startblock, - blockcount, new_fsb, new_len, pcur); + error = xfs_refcount_finish_one(tp, ri, pcur); /* * Mark the transaction dirty, even on error. This ensures the @@ -297,16 +292,16 @@ xfs_refcount_update_diff_items( /* Set the phys extent flags for this reverse mapping. 
*/ static void xfs_trans_set_refcount_flags( - struct xfs_phys_extent *refc, + struct xfs_phys_extent *pmap, enum xfs_refcount_intent_type type) { - refc->pe_flags = 0; + pmap->pe_flags = 0; switch (type) { case XFS_REFCOUNT_INCREASE: case XFS_REFCOUNT_DECREASE: case XFS_REFCOUNT_ALLOC_COW: case XFS_REFCOUNT_FREE_COW: - refc->pe_flags |= type; + pmap->pe_flags |= type; break; default: ASSERT(0); @@ -318,10 +313,10 @@ STATIC void xfs_refcount_update_log_item( struct xfs_trans *tp, struct xfs_cui_log_item *cuip, - struct xfs_refcount_intent *refc) + struct xfs_refcount_intent *ri) { uint next_extent; - struct xfs_phys_extent *ext; + struct xfs_phys_extent *pmap; tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); @@ -333,10 +328,10 @@ xfs_refcount_update_log_item( */ next_extent = atomic_inc_return(&cuip->cui_next_extent) - 1; ASSERT(next_extent < cuip->cui_format.cui_nextents); - ext = &cuip->cui_format.cui_extents[next_extent]; - ext->pe_startblock = refc->ri_startblock; - ext->pe_len = refc->ri_blockcount; - xfs_trans_set_refcount_flags(ext, refc->ri_type); + pmap = &cuip->cui_format.cui_extents[next_extent]; + pmap->pe_startblock = ri->ri_startblock; + pmap->pe_len = ri->ri_blockcount; + xfs_trans_set_refcount_flags(pmap, ri->ri_type); } static struct xfs_log_item * @@ -348,15 +343,15 @@ xfs_refcount_update_create_intent( { struct xfs_mount *mp = tp->t_mountp; struct xfs_cui_log_item *cuip = xfs_cui_init(mp, count); - struct xfs_refcount_intent *refc; + struct xfs_refcount_intent *ri; ASSERT(count > 0); xfs_trans_add_item(tp, &cuip->cui_item); if (sort) list_sort(mp, items, xfs_refcount_update_diff_items); - list_for_each_entry(refc, items, ri_list) - xfs_refcount_update_log_item(tp, cuip, refc); + list_for_each_entry(ri, items, ri_list) + xfs_refcount_update_log_item(tp, cuip, ri); return &cuip->cui_item; } @@ -378,25 +373,20 @@ xfs_refcount_update_finish_item( struct list_head *item, struct xfs_btree_cur **state) { - struct xfs_refcount_intent *refc; - xfs_fsblock_t new_fsb; - xfs_extlen_t new_aglen; + struct xfs_refcount_intent *ri; int error; - refc = container_of(item, struct xfs_refcount_intent, ri_list); - error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), - refc->ri_type, refc->ri_startblock, refc->ri_blockcount, - &new_fsb, &new_aglen, state); + ri = container_of(item, struct xfs_refcount_intent, ri_list); + error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), ri, + state); /* Did we run out of reservation? Requeue what we didn't finish. 
*/ - if (!error && new_aglen > 0) { - ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE || - refc->ri_type == XFS_REFCOUNT_DECREASE); - refc->ri_startblock = new_fsb; - refc->ri_blockcount = new_aglen; + if (!error && ri->ri_blockcount > 0) { + ASSERT(ri->ri_type == XFS_REFCOUNT_INCREASE || + ri->ri_type == XFS_REFCOUNT_DECREASE); return -EAGAIN; } - kmem_cache_free(xfs_refcount_intent_cache, refc); + kmem_cache_free(xfs_refcount_intent_cache, ri); return error; } @@ -413,10 +403,10 @@ STATIC void xfs_refcount_update_cancel_item( struct list_head *item) { - struct xfs_refcount_intent *refc; + struct xfs_refcount_intent *ri; - refc = container_of(item, struct xfs_refcount_intent, ri_list); - kmem_cache_free(xfs_refcount_intent_cache, refc); + ri = container_of(item, struct xfs_refcount_intent, ri_list); + kmem_cache_free(xfs_refcount_intent_cache, ri); } const struct xfs_defer_op_type xfs_refcount_update_defer_type = { @@ -433,15 +423,15 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = { static inline bool xfs_cui_validate_phys( struct xfs_mount *mp, - struct xfs_phys_extent *refc) + struct xfs_phys_extent *pmap) { if (!xfs_has_reflink(mp)) return false; - if (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS) + if (pmap->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS) return false; - switch (refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK) { + switch (pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK) { case XFS_REFCOUNT_INCREASE: case XFS_REFCOUNT_DECREASE: case XFS_REFCOUNT_ALLOC_COW: @@ -451,7 +441,7 @@ xfs_cui_validate_phys( return false; } - return xfs_verify_fsbext(mp, refc->pe_startblock, refc->pe_len); + return xfs_verify_fsbext(mp, pmap->pe_startblock, pmap->pe_len); } /* @@ -463,18 +453,13 @@ xfs_cui_item_recover( struct xfs_log_item *lip, struct list_head *capture_list) { - struct xfs_bmbt_irec irec; struct xfs_cui_log_item *cuip = CUI_ITEM(lip); - struct xfs_phys_extent *refc; struct xfs_cud_log_item *cudp; struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; struct xfs_mount *mp = lip->li_log->l_mp; - xfs_fsblock_t new_fsb; - xfs_extlen_t new_len; unsigned int refc_type; bool requeue_only = false; - enum xfs_refcount_intent_type type; int i; int error = 0; @@ -513,14 +498,17 @@ xfs_cui_item_recover( cudp = xfs_trans_get_cud(tp, cuip); for (i = 0; i < cuip->cui_format.cui_nextents; i++) { - refc = &cuip->cui_format.cui_extents[i]; - refc_type = refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; + struct xfs_refcount_intent fake = { }; + struct xfs_phys_extent *pmap; + + pmap = &cuip->cui_format.cui_extents[i]; + refc_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; switch (refc_type) { case XFS_REFCOUNT_INCREASE: case XFS_REFCOUNT_DECREASE: case XFS_REFCOUNT_ALLOC_COW: case XFS_REFCOUNT_FREE_COW: - type = refc_type; + fake.ri_type = refc_type; break; default: XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, @@ -529,13 +517,12 @@ xfs_cui_item_recover( error = -EFSCORRUPTED; goto abort_error; } - if (requeue_only) { - new_fsb = refc->pe_startblock; - new_len = refc->pe_len; - } else + + fake.ri_startblock = pmap->pe_startblock; + fake.ri_blockcount = pmap->pe_len; + if (!requeue_only) error = xfs_trans_log_finish_refcount_update(tp, cudp, - type, refc->pe_startblock, refc->pe_len, - &new_fsb, &new_len, &rcur); + &fake, &rcur); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, &cuip->cui_format, @@ -544,10 +531,13 @@ xfs_cui_item_recover( goto abort_error; /* Requeue what we didn't finish. 
*/ - if (new_len > 0) { - irec.br_startblock = new_fsb; - irec.br_blockcount = new_len; - switch (type) { + if (fake.ri_blockcount > 0) { + struct xfs_bmbt_irec irec = { + .br_startblock = fake.ri_startblock, + .br_blockcount = fake.ri_blockcount, + }; + + switch (fake.ri_type) { case XFS_REFCOUNT_INCREASE: xfs_refcount_increase_extent(tp, &irec); break; @@ -596,18 +586,18 @@ xfs_cui_item_relog( { struct xfs_cud_log_item *cudp; struct xfs_cui_log_item *cuip; - struct xfs_phys_extent *extp; + struct xfs_phys_extent *pmap; unsigned int count; count = CUI_ITEM(intent)->cui_format.cui_nextents; - extp = CUI_ITEM(intent)->cui_format.cui_extents; + pmap = CUI_ITEM(intent)->cui_format.cui_extents; tp->t_flags |= XFS_TRANS_DIRTY; cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent)); set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); cuip = xfs_cui_init(tp->t_mountp, count); - memcpy(cuip->cui_format.cui_extents, extp, count * sizeof(*extp)); + memcpy(cuip->cui_format.cui_extents, pmap, count * sizeof(*pmap)); atomic_set(&cuip->cui_next_extent, count); xfs_trans_add_item(tp, &cuip->cui_item); set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 5535778a98f9..f5dc46ce9803 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -610,7 +610,7 @@ xfs_reflink_cancel_cow_blocks( if (error) break; } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) { - ASSERT((*tpp)->t_firstblock == NULLFSBLOCK); + ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER); /* Free the CoW orphan record. */ xfs_refcount_free_cow_extent(*tpp, del.br_startblock, @@ -927,7 +927,7 @@ xfs_reflink_recover_cow( for_each_perag(mp, agno, pag) { error = xfs_refcount_recover_cow_leftovers(mp, pag); if (error) { - xfs_perag_put(pag); + xfs_perag_rele(pag); break; } } diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 534504ede1a3..a1619d67015f 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -244,40 +244,40 @@ xfs_trans_get_rud( /* Set the map extent flags for this reverse mapping. 
*/ static void xfs_trans_set_rmap_flags( - struct xfs_map_extent *rmap, + struct xfs_map_extent *map, enum xfs_rmap_intent_type type, int whichfork, xfs_exntst_t state) { - rmap->me_flags = 0; + map->me_flags = 0; if (state == XFS_EXT_UNWRITTEN) - rmap->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN; + map->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN; if (whichfork == XFS_ATTR_FORK) - rmap->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK; + map->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK; switch (type) { case XFS_RMAP_MAP: - rmap->me_flags |= XFS_RMAP_EXTENT_MAP; + map->me_flags |= XFS_RMAP_EXTENT_MAP; break; case XFS_RMAP_MAP_SHARED: - rmap->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED; + map->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED; break; case XFS_RMAP_UNMAP: - rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP; + map->me_flags |= XFS_RMAP_EXTENT_UNMAP; break; case XFS_RMAP_UNMAP_SHARED: - rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED; + map->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED; break; case XFS_RMAP_CONVERT: - rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT; + map->me_flags |= XFS_RMAP_EXTENT_CONVERT; break; case XFS_RMAP_CONVERT_SHARED: - rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED; + map->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED; break; case XFS_RMAP_ALLOC: - rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC; + map->me_flags |= XFS_RMAP_EXTENT_ALLOC; break; case XFS_RMAP_FREE: - rmap->me_flags |= XFS_RMAP_EXTENT_FREE; + map->me_flags |= XFS_RMAP_EXTENT_FREE; break; default: ASSERT(0); @@ -293,19 +293,12 @@ static int xfs_trans_log_finish_rmap_update( struct xfs_trans *tp, struct xfs_rud_log_item *rudp, - enum xfs_rmap_intent_type type, - uint64_t owner, - int whichfork, - xfs_fileoff_t startoff, - xfs_fsblock_t startblock, - xfs_filblks_t blockcount, - xfs_exntst_t state, + struct xfs_rmap_intent *ri, struct xfs_btree_cur **pcur) { int error; - error = xfs_rmap_finish_one(tp, type, owner, whichfork, startoff, - startblock, blockcount, state, pcur); + error = xfs_rmap_finish_one(tp, ri, pcur); /* * Mark the transaction dirty, even on error. 
This ensures the @@ -342,7 +335,7 @@ STATIC void xfs_rmap_update_log_item( struct xfs_trans *tp, struct xfs_rui_log_item *ruip, - struct xfs_rmap_intent *rmap) + struct xfs_rmap_intent *ri) { uint next_extent; struct xfs_map_extent *map; @@ -358,12 +351,12 @@ xfs_rmap_update_log_item( next_extent = atomic_inc_return(&ruip->rui_next_extent) - 1; ASSERT(next_extent < ruip->rui_format.rui_nextents); map = &ruip->rui_format.rui_extents[next_extent]; - map->me_owner = rmap->ri_owner; - map->me_startblock = rmap->ri_bmap.br_startblock; - map->me_startoff = rmap->ri_bmap.br_startoff; - map->me_len = rmap->ri_bmap.br_blockcount; - xfs_trans_set_rmap_flags(map, rmap->ri_type, rmap->ri_whichfork, - rmap->ri_bmap.br_state); + map->me_owner = ri->ri_owner; + map->me_startblock = ri->ri_bmap.br_startblock; + map->me_startoff = ri->ri_bmap.br_startoff; + map->me_len = ri->ri_bmap.br_blockcount; + xfs_trans_set_rmap_flags(map, ri->ri_type, ri->ri_whichfork, + ri->ri_bmap.br_state); } static struct xfs_log_item * @@ -375,15 +368,15 @@ xfs_rmap_update_create_intent( { struct xfs_mount *mp = tp->t_mountp; struct xfs_rui_log_item *ruip = xfs_rui_init(mp, count); - struct xfs_rmap_intent *rmap; + struct xfs_rmap_intent *ri; ASSERT(count > 0); xfs_trans_add_item(tp, &ruip->rui_item); if (sort) list_sort(mp, items, xfs_rmap_update_diff_items); - list_for_each_entry(rmap, items, ri_list) - xfs_rmap_update_log_item(tp, ruip, rmap); + list_for_each_entry(ri, items, ri_list) + xfs_rmap_update_log_item(tp, ruip, ri); return &ruip->rui_item; } @@ -405,16 +398,14 @@ xfs_rmap_update_finish_item( struct list_head *item, struct xfs_btree_cur **state) { - struct xfs_rmap_intent *rmap; + struct xfs_rmap_intent *ri; int error; - rmap = container_of(item, struct xfs_rmap_intent, ri_list); - error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), - rmap->ri_type, rmap->ri_owner, rmap->ri_whichfork, - rmap->ri_bmap.br_startoff, rmap->ri_bmap.br_startblock, - rmap->ri_bmap.br_blockcount, rmap->ri_bmap.br_state, + ri = container_of(item, struct xfs_rmap_intent, ri_list); + + error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), ri, state); - kmem_cache_free(xfs_rmap_intent_cache, rmap); + kmem_cache_free(xfs_rmap_intent_cache, ri); return error; } @@ -431,10 +422,10 @@ STATIC void xfs_rmap_update_cancel_item( struct list_head *item) { - struct xfs_rmap_intent *rmap; + struct xfs_rmap_intent *ri; - rmap = container_of(item, struct xfs_rmap_intent, ri_list); - kmem_cache_free(xfs_rmap_intent_cache, rmap); + ri = container_of(item, struct xfs_rmap_intent, ri_list); + kmem_cache_free(xfs_rmap_intent_cache, ri); } const struct xfs_defer_op_type xfs_rmap_update_defer_type = { @@ -451,15 +442,15 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = { static inline bool xfs_rui_validate_map( struct xfs_mount *mp, - struct xfs_map_extent *rmap) + struct xfs_map_extent *map) { if (!xfs_has_rmapbt(mp)) return false; - if (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS) + if (map->me_flags & ~XFS_RMAP_EXTENT_FLAGS) return false; - switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { + switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { case XFS_RMAP_EXTENT_MAP: case XFS_RMAP_EXTENT_MAP_SHARED: case XFS_RMAP_EXTENT_UNMAP: @@ -473,14 +464,14 @@ xfs_rui_validate_map( return false; } - if (!XFS_RMAP_NON_INODE_OWNER(rmap->me_owner) && - !xfs_verify_ino(mp, rmap->me_owner)) + if (!XFS_RMAP_NON_INODE_OWNER(map->me_owner) && + !xfs_verify_ino(mp, map->me_owner)) return false; - if (!xfs_verify_fileext(mp, rmap->me_startoff, 
rmap->me_len)) + if (!xfs_verify_fileext(mp, map->me_startoff, map->me_len)) return false; - return xfs_verify_fsbext(mp, rmap->me_startblock, rmap->me_len); + return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); } /* @@ -493,15 +484,11 @@ xfs_rui_item_recover( struct list_head *capture_list) { struct xfs_rui_log_item *ruip = RUI_ITEM(lip); - struct xfs_map_extent *rmap; struct xfs_rud_log_item *rudp; struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; struct xfs_mount *mp = lip->li_log->l_mp; - enum xfs_rmap_intent_type type; - xfs_exntst_t state; int i; - int whichfork; int error = 0; /* @@ -526,35 +513,34 @@ xfs_rui_item_recover( rudp = xfs_trans_get_rud(tp, ruip); for (i = 0; i < ruip->rui_format.rui_nextents; i++) { - rmap = &ruip->rui_format.rui_extents[i]; - state = (rmap->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? - XFS_EXT_UNWRITTEN : XFS_EXT_NORM; - whichfork = (rmap->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; - switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { + struct xfs_rmap_intent fake = { }; + struct xfs_map_extent *map; + + map = &ruip->rui_format.rui_extents[i]; + switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { case XFS_RMAP_EXTENT_MAP: - type = XFS_RMAP_MAP; + fake.ri_type = XFS_RMAP_MAP; break; case XFS_RMAP_EXTENT_MAP_SHARED: - type = XFS_RMAP_MAP_SHARED; + fake.ri_type = XFS_RMAP_MAP_SHARED; break; case XFS_RMAP_EXTENT_UNMAP: - type = XFS_RMAP_UNMAP; + fake.ri_type = XFS_RMAP_UNMAP; break; case XFS_RMAP_EXTENT_UNMAP_SHARED: - type = XFS_RMAP_UNMAP_SHARED; + fake.ri_type = XFS_RMAP_UNMAP_SHARED; break; case XFS_RMAP_EXTENT_CONVERT: - type = XFS_RMAP_CONVERT; + fake.ri_type = XFS_RMAP_CONVERT; break; case XFS_RMAP_EXTENT_CONVERT_SHARED: - type = XFS_RMAP_CONVERT_SHARED; + fake.ri_type = XFS_RMAP_CONVERT_SHARED; break; case XFS_RMAP_EXTENT_ALLOC: - type = XFS_RMAP_ALLOC; + fake.ri_type = XFS_RMAP_ALLOC; break; case XFS_RMAP_EXTENT_FREE: - type = XFS_RMAP_FREE; + fake.ri_type = XFS_RMAP_FREE; break; default: XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, @@ -563,13 +549,21 @@ xfs_rui_item_recover( error = -EFSCORRUPTED; goto abort_error; } - error = xfs_trans_log_finish_rmap_update(tp, rudp, type, - rmap->me_owner, whichfork, - rmap->me_startoff, rmap->me_startblock, - rmap->me_len, state, &rcur); + + fake.ri_owner = map->me_owner; + fake.ri_whichfork = (map->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + fake.ri_bmap.br_startblock = map->me_startblock; + fake.ri_bmap.br_startoff = map->me_startoff; + fake.ri_bmap.br_blockcount = map->me_len; + fake.ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? 
+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + + error = xfs_trans_log_finish_rmap_update(tp, rudp, &fake, + &rcur); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - rmap, sizeof(*rmap)); + map, sizeof(*map)); if (error) goto abort_error; @@ -600,18 +594,18 @@ xfs_rui_item_relog( { struct xfs_rud_log_item *rudp; struct xfs_rui_log_item *ruip; - struct xfs_map_extent *extp; + struct xfs_map_extent *map; unsigned int count; count = RUI_ITEM(intent)->rui_format.rui_nextents; - extp = RUI_ITEM(intent)->rui_format.rui_extents; + map = RUI_ITEM(intent)->rui_format.rui_extents; tp->t_flags |= XFS_TRANS_DIRTY; rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent)); set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); ruip = xfs_rui_init(tp->t_mountp, count); - memcpy(ruip->rui_format.rui_extents, extp, count * sizeof(*extp)); + memcpy(ruip->rui_format.rui_extents, map, count * sizeof(*map)); atomic_set(&ruip->rui_next_extent, count); xfs_trans_add_item(tp, &ruip->rui_item); set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 0c4b73e9b29d..2479b5cbd75e 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -247,6 +247,32 @@ xfs_fs_show_options( return 0; } +static bool +xfs_set_inode_alloc_perag( + struct xfs_perag *pag, + xfs_ino_t ino, + xfs_agnumber_t max_metadata) +{ + if (!xfs_is_inode32(pag->pag_mount)) { + set_bit(XFS_AGSTATE_ALLOWS_INODES, &pag->pag_opstate); + clear_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate); + return false; + } + + if (ino > XFS_MAXINUMBER_32) { + clear_bit(XFS_AGSTATE_ALLOWS_INODES, &pag->pag_opstate); + clear_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate); + return false; + } + + set_bit(XFS_AGSTATE_ALLOWS_INODES, &pag->pag_opstate); + if (pag->pag_agno < max_metadata) + set_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate); + else + clear_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate); + return true; +} + /* * Set parameters for inode allocation heuristics, taking into account * filesystem size and inode32/inode64 mount options; i.e. specifically @@ -310,24 +336,8 @@ xfs_set_inode_alloc( ino = XFS_AGINO_TO_INO(mp, index, agino); pag = xfs_perag_get(mp, index); - - if (xfs_is_inode32(mp)) { - if (ino > XFS_MAXINUMBER_32) { - pag->pagi_inodeok = 0; - pag->pagf_metadata = 0; - } else { - pag->pagi_inodeok = 1; - maxagi++; - if (index < max_metadata) - pag->pagf_metadata = 1; - else - pag->pagf_metadata = 0; - } - } else { - pag->pagi_inodeok = 1; - pag->pagf_metadata = 0; - } - + if (xfs_set_inode_alloc_perag(pag, ino, max_metadata)) + maxagi++; xfs_perag_put(pag); } @@ -1922,7 +1932,6 @@ static int xfs_init_fs_context( return -ENOMEM; spin_lock_init(&mp->m_sb_lock); - spin_lock_init(&mp->m_agirotor_lock); INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); spin_lock_init(&mp->m_perag_lock); mutex_init(&mp->m_growlock); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 8389f3ef88ef..85e433df6a3f 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -144,7 +144,7 @@ xfs_readlink( int xfs_symlink( - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct xfs_inode *dp, struct xfs_name *link_name, const char *target_path, @@ -193,8 +193,8 @@ xfs_symlink( /* * Make sure that we have allocated dquot(s) on disk. 
*/ - error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns), - mapped_fsgid(mnt_userns, &init_user_ns), prid, + error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns), + mapped_fsgid(idmap, &init_user_ns), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -231,7 +231,7 @@ xfs_symlink( */ error = xfs_dialloc(&tp, dp->i_ino, S_IFLNK, &ino); if (!error) - error = xfs_init_new_inode(mnt_userns, tp, dp, ino, + error = xfs_init_new_inode(idmap, tp, dp, ino, S_IFLNK | (mode & ~S_IFMT), 1, 0, prid, false, &ip); if (error) diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h index 2586b7e393f3..d1ca1ce62a93 100644 --- a/fs/xfs/xfs_symlink.h +++ b/fs/xfs/xfs_symlink.h @@ -7,7 +7,7 @@ /* Kernel only symlink definitions */ -int xfs_symlink(struct user_namespace *mnt_userns, struct xfs_inode *dp, +int xfs_symlink(struct mnt_idmap *idmap, struct xfs_inode *dp, struct xfs_name *link_name, const char *target_path, umode_t mode, struct xfs_inode **ipp); int xfs_readlink_bmap_ilocked(struct xfs_inode *ip, char *link); diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index f7faf6e70d7f..a3c6b1548723 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -69,7 +69,7 @@ static struct attribute *xfs_mp_attrs[] = { }; ATTRIBUTE_GROUPS(xfs_mp); -struct kobj_type xfs_mp_ktype = { +const struct kobj_type xfs_mp_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_mp_groups, @@ -266,7 +266,7 @@ static struct attribute *xfs_dbg_attrs[] = { }; ATTRIBUTE_GROUPS(xfs_dbg); -struct kobj_type xfs_dbg_ktype = { +const struct kobj_type xfs_dbg_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_dbg_groups, @@ -324,7 +324,7 @@ static struct attribute *xfs_stats_attrs[] = { }; ATTRIBUTE_GROUPS(xfs_stats); -struct kobj_type xfs_stats_ktype = { +const struct kobj_type xfs_stats_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_stats_groups, @@ -410,7 +410,7 @@ static struct attribute *xfs_log_attrs[] = { }; ATTRIBUTE_GROUPS(xfs_log); -struct kobj_type xfs_log_ktype = { +const struct kobj_type xfs_log_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_log_groups, @@ -564,13 +564,13 @@ static struct attribute *xfs_error_attrs[] = { }; ATTRIBUTE_GROUPS(xfs_error); -static struct kobj_type xfs_error_cfg_ktype = { +static const struct kobj_type xfs_error_cfg_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_error_groups, }; -static struct kobj_type xfs_error_ktype = { +static const struct kobj_type xfs_error_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, }; diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h index 513095e353a5..148893ebfdef 100644 --- a/fs/xfs/xfs_sysfs.h +++ b/fs/xfs/xfs_sysfs.h @@ -7,10 +7,10 @@ #ifndef __XFS_SYSFS_H__ #define __XFS_SYSFS_H__ -extern struct kobj_type xfs_mp_ktype; /* xfs_mount */ -extern struct kobj_type xfs_dbg_ktype; /* debug */ -extern struct kobj_type xfs_log_ktype; /* xlog */ -extern struct kobj_type xfs_stats_ktype; /* stats */ +extern const struct kobj_type xfs_mp_ktype; /* xfs_mount */ +extern const struct kobj_type xfs_dbg_ktype; /* debug */ +extern const struct kobj_type xfs_log_ktype; /* xlog */ +extern const struct kobj_type xfs_stats_ktype; /* stats */ static inline struct xfs_kobj * to_kobj(struct kobject *kobject) @@ -28,7 +28,7 @@ xfs_sysfs_release(struct kobject *kobject) static inline int 
xfs_sysfs_init( struct xfs_kobj *kobj, - struct kobj_type *ktype, + const struct kobj_type *ktype, struct xfs_kobj *parent_kobj, const char *name) { diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 421d1e504ac4..7dc0fd6a6504 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -74,6 +74,7 @@ struct xfs_inobt_rec_incore; union xfs_btree_ptr; struct xfs_dqtrx; struct xfs_icwalk; +struct xfs_perag; #define XFS_ATTR_FILTER_FLAGS \ { XFS_ATTR_ROOT, "ROOT" }, \ @@ -159,36 +160,40 @@ TRACE_EVENT(xlog_intent_recovery_failed, ); DECLARE_EVENT_CLASS(xfs_perag_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, - unsigned long caller_ip), - TP_ARGS(mp, agno, refcount, caller_ip), + TP_PROTO(struct xfs_perag *pag, unsigned long caller_ip), + TP_ARGS(pag, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(int, refcount) + __field(int, active_refcount) __field(unsigned long, caller_ip) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; - __entry->refcount = refcount; + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->refcount = atomic_read(&pag->pag_ref); + __entry->active_refcount = atomic_read(&pag->pag_active_ref); __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d agno 0x%x refcount %d caller %pS", + TP_printk("dev %d:%d agno 0x%x passive refs %d active refs %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->refcount, + __entry->active_refcount, (char *)__entry->caller_ip) ); #define DEFINE_PERAG_REF_EVENT(name) \ DEFINE_EVENT(xfs_perag_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \ - unsigned long caller_ip), \ - TP_ARGS(mp, agno, refcount, caller_ip)) + TP_PROTO(struct xfs_perag *pag, unsigned long caller_ip), \ + TP_ARGS(pag, caller_ip)) DEFINE_PERAG_REF_EVENT(xfs_perag_get); DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_put); +DEFINE_PERAG_REF_EVENT(xfs_perag_grab); +DEFINE_PERAG_REF_EVENT(xfs_perag_grab_tag); +DEFINE_PERAG_REF_EVENT(xfs_perag_rele); DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag); @@ -634,8 +639,8 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); DECLARE_EVENT_CLASS(xfs_filestream_class, - TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, xfs_agnumber_t agno), - TP_ARGS(mp, ino, agno), + TP_PROTO(struct xfs_perag *pag, xfs_ino_t ino), + TP_ARGS(pag, ino), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) @@ -643,10 +648,10 @@ DECLARE_EVENT_CLASS(xfs_filestream_class, __field(int, streams) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; + __entry->dev = pag->pag_mount->m_super->s_dev; __entry->ino = ino; - __entry->agno = agno; - __entry->streams = xfs_filestream_peek_ag(mp, agno); + __entry->agno = pag->pag_agno; + __entry->streams = atomic_read(&pag->pagf_fstrms); ), TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d", MAJOR(__entry->dev), MINOR(__entry->dev), @@ -656,39 +661,40 @@ DECLARE_EVENT_CLASS(xfs_filestream_class, ) #define DEFINE_FILESTREAM_EVENT(name) \ DEFINE_EVENT(xfs_filestream_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, xfs_agnumber_t agno), \ - TP_ARGS(mp, ino, agno)) + TP_PROTO(struct xfs_perag *pag, xfs_ino_t ino), \ + TP_ARGS(pag, ino)) DEFINE_FILESTREAM_EVENT(xfs_filestream_free); DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup); DEFINE_FILESTREAM_EVENT(xfs_filestream_scan); 
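The xfs_perag_class and xfs_filestream_class conversions above move every tracepoint call site from a hand-unpacked (mp, agno, refcount, caller_ip) argument list to a single perag pointer plus caller address. A minimal sketch of the new call shape; the wrapper function here is hypothetical, while xfs_perag_get() and the tracepoint itself come from the patch:

	/*
	 * Illustrative sketch only: example_get_and_trace() is not part of
	 * the patch. The tracepoint now reads the mount, AG number and both
	 * reference counts out of the perag itself instead of taking them
	 * as separate arguments.
	 */
	static inline struct xfs_perag *
	example_get_and_trace(struct xfs_mount *mp, xfs_agnumber_t agno)
	{
		struct xfs_perag *pag = xfs_perag_get(mp, agno);

		/* was trace_xfs_perag_get(mp, agno, refcount, _RET_IP_) */
		trace_xfs_perag_get(pag, _RET_IP_);
		return pag;
	}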
TRACE_EVENT(xfs_filestream_pick, - TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno, - xfs_extlen_t free, int nscan), - TP_ARGS(ip, agno, free, nscan), + TP_PROTO(struct xfs_perag *pag, xfs_ino_t ino, xfs_extlen_t free), + TP_ARGS(pag, ino, free), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(xfs_agnumber_t, agno) __field(int, streams) __field(xfs_extlen_t, free) - __field(int, nscan) ), TP_fast_assign( - __entry->dev = VFS_I(ip)->i_sb->s_dev; - __entry->ino = ip->i_ino; - __entry->agno = agno; - __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno); + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->ino = ino; + if (pag) { + __entry->agno = pag->pag_agno; + __entry->streams = atomic_read(&pag->pagf_fstrms); + } else { + __entry->agno = NULLAGNUMBER; + __entry->streams = 0; + } __entry->free = free; - __entry->nscan = nscan; ), - TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d free %d nscan %d", + TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d free %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->agno, __entry->streams, - __entry->free, - __entry->nscan) + __entry->free) ); DECLARE_EVENT_CLASS(xfs_lock_class, @@ -1795,13 +1801,11 @@ DECLARE_EVENT_CLASS(xfs_alloc_class, __field(xfs_extlen_t, alignment) __field(xfs_extlen_t, minalignslop) __field(xfs_extlen_t, len) - __field(short, type) - __field(short, otype) __field(char, wasdel) __field(char, wasfromfl) __field(int, resv) __field(int, datatype) - __field(xfs_fsblock_t, firstblock) + __field(xfs_agnumber_t, highest_agno) ), TP_fast_assign( __entry->dev = args->mp->m_super->s_dev; @@ -1816,18 +1820,16 @@ DECLARE_EVENT_CLASS(xfs_alloc_class, __entry->alignment = args->alignment; __entry->minalignslop = args->minalignslop; __entry->len = args->len; - __entry->type = args->type; - __entry->otype = args->otype; __entry->wasdel = args->wasdel; __entry->wasfromfl = args->wasfromfl; __entry->resv = args->resv; __entry->datatype = args->datatype; - __entry->firstblock = args->tp->t_firstblock; + __entry->highest_agno = args->tp->t_highest_agno; ), TP_printk("dev %d:%d agno 0x%x agbno 0x%x minlen %u maxlen %u mod %u " "prod %u minleft %u total %u alignment %u minalignslop %u " - "len %u type %s otype %s wasdel %d wasfromfl %d resv %d " - "datatype 0x%x firstblock 0x%llx", + "len %u wasdel %d wasfromfl %d resv %d " + "datatype 0x%x highest_agno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -1840,13 +1842,11 @@ DECLARE_EVENT_CLASS(xfs_alloc_class, __entry->alignment, __entry->minalignslop, __entry->len, - __print_symbolic(__entry->type, XFS_ALLOC_TYPES), - __print_symbolic(__entry->otype, XFS_ALLOC_TYPES), __entry->wasdel, __entry->wasfromfl, __entry->resv, __entry->datatype, - (unsigned long long)__entry->firstblock) + __entry->highest_agno) ) #define DEFINE_ALLOC_EVENT(name) \ @@ -1877,6 +1877,7 @@ DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough); DEFINE_ALLOC_EVENT(xfs_alloc_small_done); DEFINE_ALLOC_EVENT(xfs_alloc_small_error); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_badargs); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_skip_deadlock); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_nofix); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed); @@ -3207,17 +3208,14 @@ DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_deferred); TRACE_EVENT(xfs_refcount_finish_one_leftover, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - int type, xfs_agblock_t agbno, xfs_extlen_t len, - xfs_agblock_t new_agbno, xfs_extlen_t new_len), - 
TP_ARGS(mp, agno, type, agbno, len, new_agbno, new_len), + int type, xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, type, agbno, len), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(int, type) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) - __field(xfs_agblock_t, new_agbno) - __field(xfs_extlen_t, new_len) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; @@ -3225,17 +3223,13 @@ TRACE_EVENT(xfs_refcount_finish_one_leftover, __entry->type = type; __entry->agbno = agbno; __entry->len = len; - __entry->new_agbno = new_agbno; - __entry->new_len = new_len; ), - TP_printk("dev %d:%d type %d agno 0x%x agbno 0x%x fsbcount 0x%x new_agbno 0x%x new_fsbcount 0x%x", + TP_printk("dev %d:%d type %d agno 0x%x agbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->type, __entry->agno, __entry->agbno, - __entry->len, - __entry->new_agbno, - __entry->new_len) + __entry->len) ); /* simple inode-based error/%ip tracepoint class */ diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 7bd16fbff534..8afc0c080861 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -102,7 +102,7 @@ xfs_trans_dup( INIT_LIST_HEAD(&ntp->t_items); INIT_LIST_HEAD(&ntp->t_busy); INIT_LIST_HEAD(&ntp->t_dfops); - ntp->t_firstblock = NULLFSBLOCK; + ntp->t_highest_agno = NULLAGNUMBER; ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(tp->t_ticket != NULL); @@ -278,7 +278,7 @@ retry: INIT_LIST_HEAD(&tp->t_items); INIT_LIST_HEAD(&tp->t_busy); INIT_LIST_HEAD(&tp->t_dfops); - tp->t_firstblock = NULLFSBLOCK; + tp->t_highest_agno = NULLAGNUMBER; error = xfs_trans_reserve(tp, resp, blocks, rtextents); if (error == -ENOSPC && want_retry) { @@ -1078,10 +1078,10 @@ xfs_trans_cancel( /* * It's never valid to cancel a transaction with deferred ops attached, * because the transaction is effectively dirty. Complain about this - * loudly before freeing the in-memory defer items. + * loudly before freeing the in-memory defer items and shutting down the + * filesystem. 
*/ if (!list_empty(&tp->t_dfops)) { - ASSERT(xfs_is_shutdown(mp) || list_empty(&tp->t_dfops)); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); dirty = true; xfs_defer_cancel(tp); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 55819785941c..6e3646d524ce 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -132,7 +132,7 @@ typedef struct xfs_trans { unsigned int t_rtx_res; /* # of rt extents resvd */ unsigned int t_rtx_res_used; /* # of resvd rt extents used */ unsigned int t_flags; /* misc flags */ - xfs_fsblock_t t_firstblock; /* first block allocated */ + xfs_agnumber_t t_highest_agno; /* highest AGF locked */ struct xlog_ticket *t_ticket; /* log mgr ticket */ struct xfs_mount *t_mountp; /* ptr to fs mount struct */ struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */ diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 10aa1fd39d2b..7b9a0ed1b11f 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -133,7 +133,7 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, static int xfs_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, struct dentry *unused, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { diff --git a/fs/zonefs/Makefile b/fs/zonefs/Makefile index 9fe54f5319f2..645f7229de4a 100644 --- a/fs/zonefs/Makefile +++ b/fs/zonefs/Makefile @@ -3,4 +3,4 @@ ccflags-y += -I$(src) obj-$(CONFIG_ZONEFS_FS) += zonefs.o -zonefs-y := super.o sysfs.o +zonefs-y := super.o file.o sysfs.o diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c new file mode 100644 index 000000000000..738b0e28d74b --- /dev/null +++ b/fs/zonefs/file.c @@ -0,0 +1,878 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Simple file system for zoned block devices exposing zones as files. + * + * Copyright (C) 2022 Western Digital Corporation or its affiliates. + */ +#include <linux/module.h> +#include <linux/pagemap.h> +#include <linux/iomap.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/blkdev.h> +#include <linux/statfs.h> +#include <linux/writeback.h> +#include <linux/quotaops.h> +#include <linux/seq_file.h> +#include <linux/parser.h> +#include <linux/uio.h> +#include <linux/mman.h> +#include <linux/sched/mm.h> +#include <linux/task_io_accounting_ops.h> + +#include "zonefs.h" + +#include "trace.h" + +static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, + loff_t length, unsigned int flags, + struct iomap *iomap, struct iomap *srcmap) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); + struct super_block *sb = inode->i_sb; + loff_t isize; + + /* + * All blocks are always mapped below EOF. If reading past EOF, + * act as if there is a hole up to the file maximum size. 
+ */ + mutex_lock(&zi->i_truncate_mutex); + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); + isize = i_size_read(inode); + if (iomap->offset >= isize) { + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; + iomap->length = length; + } else { + iomap->type = IOMAP_MAPPED; + iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset; + iomap->length = isize - iomap->offset; + } + mutex_unlock(&zi->i_truncate_mutex); + + trace_zonefs_iomap_begin(inode, iomap); + + return 0; +} + +static const struct iomap_ops zonefs_read_iomap_ops = { + .iomap_begin = zonefs_read_iomap_begin, +}; + +static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, + loff_t length, unsigned int flags, + struct iomap *iomap, struct iomap *srcmap) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); + struct super_block *sb = inode->i_sb; + loff_t isize; + + /* All write I/Os should always be within the file maximum size */ + if (WARN_ON_ONCE(offset + length > z->z_capacity)) + return -EIO; + + /* + * Sequential zones can only accept direct writes. This is already + * checked when writes are issued, so warn if we see a page writeback + * operation. + */ + if (WARN_ON_ONCE(zonefs_zone_is_seq(z) && !(flags & IOMAP_DIRECT))) + return -EIO; + + /* + * For conventional zones, all blocks are always mapped. For sequential + * zones, all blocks are always mapped below the inode size (zone + * write pointer) and unwritten beyond. + */ + mutex_lock(&zi->i_truncate_mutex); + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); + iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset; + isize = i_size_read(inode); + if (iomap->offset >= isize) { + iomap->type = IOMAP_UNWRITTEN; + iomap->length = z->z_capacity - iomap->offset; + } else { + iomap->type = IOMAP_MAPPED; + iomap->length = isize - iomap->offset; + } + mutex_unlock(&zi->i_truncate_mutex); + + trace_zonefs_iomap_begin(inode, iomap); + + return 0; +} + +static const struct iomap_ops zonefs_write_iomap_ops = { + .iomap_begin = zonefs_write_iomap_begin, +}; + +static int zonefs_read_folio(struct file *unused, struct folio *folio) +{ + return iomap_read_folio(folio, &zonefs_read_iomap_ops); +} + +static void zonefs_readahead(struct readahead_control *rac) +{ + iomap_readahead(rac, &zonefs_read_iomap_ops); +} + +/* + * Map blocks for page writeback. This is used only on conventional zone files, + * which implies that the page range can only be within the fixed inode size.
+ */ +static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, + struct inode *inode, loff_t offset) +{ + struct zonefs_zone *z = zonefs_inode_zone(inode); + + if (WARN_ON_ONCE(zonefs_zone_is_seq(z))) + return -EIO; + if (WARN_ON_ONCE(offset >= i_size_read(inode))) + return -EIO; + + /* If the mapping is already OK, nothing needs to be done */ + if (offset >= wpc->iomap.offset && + offset < wpc->iomap.offset + wpc->iomap.length) + return 0; + + return zonefs_write_iomap_begin(inode, offset, + z->z_capacity - offset, + IOMAP_WRITE, &wpc->iomap, NULL); +} + +static const struct iomap_writeback_ops zonefs_writeback_ops = { + .map_blocks = zonefs_write_map_blocks, +}; + +static int zonefs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct iomap_writepage_ctx wpc = { }; + + return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops); +} + +static int zonefs_swap_activate(struct swap_info_struct *sis, + struct file *swap_file, sector_t *span) +{ + struct inode *inode = file_inode(swap_file); + + if (zonefs_inode_is_seq(inode)) { + zonefs_err(inode->i_sb, + "swap file: not a conventional zone file\n"); + return -EINVAL; + } + + return iomap_swapfile_activate(sis, swap_file, span, + &zonefs_read_iomap_ops); +} + +const struct address_space_operations zonefs_file_aops = { + .read_folio = zonefs_read_folio, + .readahead = zonefs_readahead, + .writepages = zonefs_writepages, + .dirty_folio = filemap_dirty_folio, + .release_folio = iomap_release_folio, + .invalidate_folio = iomap_invalidate_folio, + .migrate_folio = filemap_migrate_folio, + .is_partially_uptodate = iomap_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, + .direct_IO = noop_direct_IO, + .swap_activate = zonefs_swap_activate, +}; + +int zonefs_file_truncate(struct inode *inode, loff_t isize) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); + loff_t old_isize; + enum req_op op; + int ret = 0; + + /* + * Only sequential zone files can be truncated and truncation is allowed + * only down to a 0 size, which is equivalent to a zone reset, and to + * the maximum file size, which is equivalent to a zone finish. + */ + if (!zonefs_zone_is_seq(z)) + return -EPERM; + + if (!isize) + op = REQ_OP_ZONE_RESET; + else if (isize == z->z_capacity) + op = REQ_OP_ZONE_FINISH; + else + return -EPERM; + + inode_dio_wait(inode); + + /* Serialize against page faults */ + filemap_invalidate_lock(inode->i_mapping); + + /* Serialize against zonefs_iomap_begin() */ + mutex_lock(&zi->i_truncate_mutex); + + old_isize = i_size_read(inode); + if (isize == old_isize) + goto unlock; + + ret = zonefs_inode_zone_mgmt(inode, op); + if (ret) + goto unlock; + + /* + * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set, + * take care of open zones. + */ + if (z->z_flags & ZONEFS_ZONE_OPEN) { + /* + * Truncating a zone to EMPTY or FULL is the equivalent of + * closing the zone. For a truncation to 0, we need to + * re-open the zone to ensure new writes can be processed. + * For a truncation to the maximum file size, the zone is + * closed and writes cannot be accepted anymore, so clear + * the open flag. 
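+ * + * From user space, the only allowed truncations of a sequential zone + * file are thus (sketch, with zcap standing for the file zone + * capacity): + * + * ftruncate(fd, 0); - zone reset + * ftruncate(fd, zcap); - zone finish + * + * Any other target size fails with EPERM.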
+ */ + if (!isize) + ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_OPEN); + else + z->z_flags &= ~ZONEFS_ZONE_OPEN; + } + + zonefs_update_stats(inode, isize); + truncate_setsize(inode, isize); + z->z_wpoffset = isize; + zonefs_inode_account_active(inode); + +unlock: + mutex_unlock(&zi->i_truncate_mutex); + filemap_invalidate_unlock(inode->i_mapping); + + return ret; +} + +static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + struct inode *inode = file_inode(file); + int ret = 0; + + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + + /* + * Since only direct writes are allowed in sequential files, page cache + * flush is needed only for conventional zone files. + */ + if (zonefs_inode_is_cnv(inode)) + ret = file_write_and_wait_range(file, start, end); + if (!ret) + ret = blkdev_issue_flush(inode->i_sb->s_bdev); + + if (ret) + zonefs_io_error(inode, true); + + return ret; +} + +static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vmf->vma->vm_file); + vm_fault_t ret; + + if (unlikely(IS_IMMUTABLE(inode))) + return VM_FAULT_SIGBUS; + + /* + * Sanity check: only conventional zone files can have shared + * writeable mappings. + */ + if (zonefs_inode_is_seq(inode)) + return VM_FAULT_NOPAGE; + + sb_start_pagefault(inode->i_sb); + file_update_time(vmf->vma->vm_file); + + /* Serialize against truncates */ + filemap_invalidate_lock_shared(inode->i_mapping); + ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops); + filemap_invalidate_unlock_shared(inode->i_mapping); + + sb_end_pagefault(inode->i_sb); + return ret; +} + +static const struct vm_operations_struct zonefs_file_vm_ops = { + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = zonefs_filemap_page_mkwrite, +}; + +static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + /* + * Conventional zones accept random writes, so their files can support + * shared writable mappings. For sequential zone files, only read + * mappings are possible since there are no guarantees for write + * ordering between msync() and page cache writeback. + */ + if (zonefs_inode_is_seq(file_inode(file)) && + (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + return -EINVAL; + + file_accessed(file); + vma->vm_ops = &zonefs_file_vm_ops; + + return 0; +} + +static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence) +{ + loff_t isize = i_size_read(file_inode(file)); + + /* + * Seeks are limited to below the zone size for conventional zones + * and below the zone write pointer for sequential zones. In both + * cases, this limit is the inode size. + */ + return generic_file_llseek_size(file, offset, whence, isize, isize); +} + +static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, + int error, unsigned int flags) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + if (error) { + zonefs_io_error(inode, true); + return error; + } + + if (size && zonefs_inode_is_seq(inode)) { + /* + * Note that we may be seeing completions out of order, + * but that is not a problem since a write completed + * successfully necessarily means that all preceding writes + * were also successful. So we can safely increase the inode + * size to the write end location. 
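+ * + * For example (hypothetical offsets): if three 4K direct writes queued + * at 0, 4K and 8K complete in the order 8K, 0, 4K, the completion at + * 8K can immediately publish an inode size of 12K, since its success + * implies that the device finished the two preceding writes first.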
+ */ + mutex_lock(&zi->i_truncate_mutex); + if (i_size_read(inode) < iocb->ki_pos + size) { + zonefs_update_stats(inode, iocb->ki_pos + size); + zonefs_i_size_write(inode, iocb->ki_pos + size); + } + mutex_unlock(&zi->i_truncate_mutex); + + return 0; +} + +static const struct iomap_dio_ops zonefs_write_dio_ops = { + .end_io = zonefs_file_write_dio_end_io, +}; + +static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct zonefs_zone *z = zonefs_inode_zone(inode); + struct block_device *bdev = inode->i_sb->s_bdev; + unsigned int max = bdev_max_zone_append_sectors(bdev); + struct bio *bio; + ssize_t size; + int nr_pages; + ssize_t ret; + + max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize); + iov_iter_truncate(from, max); + + nr_pages = iov_iter_npages(from, BIO_MAX_VECS); + if (!nr_pages) + return 0; + + bio = bio_alloc(bdev, nr_pages, + REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS); + bio->bi_iter.bi_sector = z->z_sector; + bio->bi_ioprio = iocb->ki_ioprio; + if (iocb_is_dsync(iocb)) + bio->bi_opf |= REQ_FUA; + + ret = bio_iov_iter_get_pages(bio, from); + if (unlikely(ret)) + goto out_release; + + size = bio->bi_iter.bi_size; + task_io_account_write(size); + + if (iocb->ki_flags & IOCB_HIPRI) + bio_set_polled(bio, iocb); + + ret = submit_bio_wait(bio); + + /* + * If the file zone was written underneath the file system, the zone + * write pointer may not be where we expect it to be, but the zone + * append write can still succeed. So check manually that we wrote where + * we intended to, that is, at z->z_wpoffset. + */ + if (!ret) { + sector_t wpsector = + z->z_sector + (z->z_wpoffset >> SECTOR_SHIFT); + + if (bio->bi_iter.bi_sector != wpsector) { + zonefs_warn(inode->i_sb, + "Corrupted write pointer %llu for zone at %llu\n", + wpsector, z->z_sector); + ret = -EIO; + } + } + + zonefs_file_write_dio_end_io(iocb, size, ret, 0); + trace_zonefs_file_dio_append(inode, size, ret); + +out_release: + bio_release_pages(bio, false); + bio_put(bio); + + if (ret >= 0) { + iocb->ki_pos += size; + return size; + } + + return ret; +} + +/* + * Do not exceed the LFS limits nor the file zone size. If pos is under the + * limit it becomes a short access. If it exceeds the limit, return -EFBIG.
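+ * + * Worked example (hypothetical values): with RLIMIT_FSIZE set to 1M, + * a 256K write at pos 896K is shortened to 128K, while any write at + * pos >= 1M raises SIGXFSZ and fails with -EFBIG; independently of the + * rlimit, a write at pos >= the zone capacity also fails with -EFBIG.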
+ */ +static loff_t zonefs_write_check_limits(struct file *file, loff_t pos, + loff_t count) +{ + struct inode *inode = file_inode(file); + struct zonefs_zone *z = zonefs_inode_zone(inode); + loff_t limit = rlimit(RLIMIT_FSIZE); + loff_t max_size = z->z_capacity; + + if (limit != RLIM_INFINITY) { + if (pos >= limit) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; + } + count = min(count, limit - pos); + } + + if (!(file->f_flags & O_LARGEFILE)) + max_size = min_t(loff_t, MAX_NON_LFS, max_size); + + if (unlikely(pos >= max_size)) + return -EFBIG; + + return min(count, max_size - pos); +} + +static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); + loff_t count; + + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + + if (!iov_iter_count(from)) + return 0; + + if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) + return -EINVAL; + + if (iocb->ki_flags & IOCB_APPEND) { + if (zonefs_zone_is_cnv(z)) + return -EINVAL; + mutex_lock(&zi->i_truncate_mutex); + iocb->ki_pos = z->z_wpoffset; + mutex_unlock(&zi->i_truncate_mutex); + } + + count = zonefs_write_check_limits(file, iocb->ki_pos, + iov_iter_count(from)); + if (count < 0) + return count; + + iov_iter_truncate(from, count); + return iov_iter_count(from); +} + +/* + * Handle direct writes. For sequential zone files, this is the only possible + * write path. For these files, check that the user is issuing writes + * sequentially from the end of the file. This code assumes that the block layer + * delivers write requests to the device in sequential order. This is always the + * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE + * elevator feature is being used (e.g. mq-deadline). The block layer always + * automatically selects such an elevator for zoned block devices during the + * device initialization. + */ +static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); + struct super_block *sb = inode->i_sb; + bool sync = is_sync_kiocb(iocb); + bool append = false; + ssize_t ret, count; + + /* + * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT + * as this can cause write reordering (e.g. the first aio gets EAGAIN + * on the inode lock but the second one goes through and is then + * unaligned).
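+ * + * Concretely (hypothetical sequence): AIOs A and B are submitted with + * IOCB_NOWAIT, A at the write pointer and B right behind it; A fails + * inode_trylock() with -EAGAIN while B takes the lock, leaving B ahead + * of the write pointer and failing the sequential check with -EINVAL. + * Refusing IOCB_NOWAIT for async sequential writes avoids this.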
+ */ + if (zonefs_zone_is_seq(z) && !sync && (iocb->ki_flags & IOCB_NOWAIT)) + return -EOPNOTSUPP; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) + return -EAGAIN; + } else { + inode_lock(inode); + } + + count = zonefs_write_checks(iocb, from); + if (count <= 0) { + ret = count; + goto inode_unlock; + } + + if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { + ret = -EINVAL; + goto inode_unlock; + } + + /* Enforce sequential writes (append only) in sequential zones */ + if (zonefs_zone_is_seq(z)) { + mutex_lock(&zi->i_truncate_mutex); + if (iocb->ki_pos != z->z_wpoffset) { + mutex_unlock(&zi->i_truncate_mutex); + ret = -EINVAL; + goto inode_unlock; + } + mutex_unlock(&zi->i_truncate_mutex); + append = sync; + } + + if (append) + ret = zonefs_file_dio_append(iocb, from); + else + ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, + &zonefs_write_dio_ops, 0, NULL, 0); + if (zonefs_zone_is_seq(z) && + (ret > 0 || ret == -EIOCBQUEUED)) { + if (ret > 0) + count = ret; + + /* + * Update the zone write pointer offset assuming the write + * operation succeeded. If it did not, the error recovery path + * will correct it. Also do active seq file accounting. + */ + mutex_lock(&zi->i_truncate_mutex); + z->z_wpoffset += count; + zonefs_inode_account_active(inode); + mutex_unlock(&zi->i_truncate_mutex); + } + +inode_unlock: + inode_unlock(inode); + + return ret; +} + +static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, + struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + /* + * Direct IO writes are mandatory for sequential zone files so that the + * write IO issuing order is preserved. + */ + if (zonefs_inode_is_seq(inode)) + return -EIO; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) + return -EAGAIN; + } else { + inode_lock(inode); + } + + ret = zonefs_write_checks(iocb, from); + if (ret <= 0) + goto inode_unlock; + + ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops); + if (ret > 0) + iocb->ki_pos += ret; + else if (ret == -EIO) + zonefs_io_error(inode, true); + +inode_unlock: + inode_unlock(inode); + if (ret > 0) + ret = generic_write_sync(iocb, ret); + + return ret; +} + +static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct zonefs_zone *z = zonefs_inode_zone(inode); + + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + + if (sb_rdonly(inode->i_sb)) + return -EROFS; + + /* Write operations beyond the zone capacity are not allowed */ + if (iocb->ki_pos >= z->z_capacity) + return -EFBIG; + + if (iocb->ki_flags & IOCB_DIRECT) { + ssize_t ret = zonefs_file_dio_write(iocb, from); + + if (ret != -ENOTBLK) + return ret; + } + + return zonefs_file_buffered_write(iocb, from); +} + +static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size, + int error, unsigned int flags) +{ + if (error) { + zonefs_io_error(file_inode(iocb->ki_filp), false); + return error; + } + + return 0; +} + +static const struct iomap_dio_ops zonefs_read_dio_ops = { + .end_io = zonefs_file_read_dio_end_io, +}; + +static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); + struct super_block *sb = inode->i_sb; + loff_t isize; + ssize_t ret; + + /* Offline zones cannot be read */ + if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 
0777))) + return -EPERM; + + if (iocb->ki_pos >= z->z_capacity) + return 0; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + } else { + inode_lock_shared(inode); + } + + /* Limit read operations to written data */ + mutex_lock(&zi->i_truncate_mutex); + isize = i_size_read(inode); + if (iocb->ki_pos >= isize) { + mutex_unlock(&zi->i_truncate_mutex); + ret = 0; + goto inode_unlock; + } + iov_iter_truncate(to, isize - iocb->ki_pos); + mutex_unlock(&zi->i_truncate_mutex); + + if (iocb->ki_flags & IOCB_DIRECT) { + size_t count = iov_iter_count(to); + + if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { + ret = -EINVAL; + goto inode_unlock; + } + file_accessed(iocb->ki_filp); + ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops, + &zonefs_read_dio_ops, 0, NULL, 0); + } else { + ret = generic_file_read_iter(iocb, to); + if (ret == -EIO) + zonefs_io_error(inode, false); + } + +inode_unlock: + inode_unlock_shared(inode); + + return ret; +} + +/* + * Write open accounting is done only for sequential files. + */ +static inline bool zonefs_seq_file_need_wro(struct inode *inode, + struct file *file) +{ + if (zonefs_inode_is_cnv(inode)) + return false; + + if (!(file->f_mode & FMODE_WRITE)) + return false; + + return true; +} + +static int zonefs_seq_file_write_open(struct inode *inode) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); + int ret = 0; + + mutex_lock(&zi->i_truncate_mutex); + + if (!zi->i_wr_refcnt) { + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files); + + if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { + + if (sbi->s_max_wro_seq_files + && wro > sbi->s_max_wro_seq_files) { + atomic_dec(&sbi->s_wro_seq_files); + ret = -EBUSY; + goto unlock; + } + + if (i_size_read(inode) < z->z_capacity) { + ret = zonefs_inode_zone_mgmt(inode, + REQ_OP_ZONE_OPEN); + if (ret) { + atomic_dec(&sbi->s_wro_seq_files); + goto unlock; + } + z->z_flags |= ZONEFS_ZONE_OPEN; + zonefs_inode_account_active(inode); + } + } + } + + zi->i_wr_refcnt++; + +unlock: + mutex_unlock(&zi->i_truncate_mutex); + + return ret; +} + +static int zonefs_file_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = generic_file_open(inode, file); + if (ret) + return ret; + + if (zonefs_seq_file_need_wro(inode, file)) + return zonefs_seq_file_write_open(inode); + + return 0; +} + +static void zonefs_seq_file_write_close(struct inode *inode) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); + struct super_block *sb = inode->i_sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + int ret = 0; + + mutex_lock(&zi->i_truncate_mutex); + + zi->i_wr_refcnt--; + if (zi->i_wr_refcnt) + goto unlock; + + /* + * The file zone may not be open anymore (e.g. the file was truncated to + * its maximum size or it was fully written). For this case, we only + * need to decrement the write open count. + */ + if (z->z_flags & ZONEFS_ZONE_OPEN) { + ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); + if (ret) { + __zonefs_io_error(inode, false); + /* + * Leaving zones explicitly open may lead to a state + * where most zones cannot be written (zone resources + * exhausted). So take preventive action by remounting + * read-only. 
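+ * + * For example (hypothetical device limit): on a drive allowing at most + * 14 active zones, every zone left open by a failed close keeps one of + * those resources busy; once all of them are stuck this way, no other + * zone can be written, hence the preventive switch to read-only below.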
+ */ + if (z->z_flags & ZONEFS_ZONE_OPEN && + !(sb->s_flags & SB_RDONLY)) { + zonefs_warn(sb, + "closing zone at %llu failed %d\n", + z->z_sector, ret); + zonefs_warn(sb, + "remounting filesystem read-only\n"); + sb->s_flags |= SB_RDONLY; + } + goto unlock; + } + + z->z_flags &= ~ZONEFS_ZONE_OPEN; + zonefs_inode_account_active(inode); + } + + atomic_dec(&sbi->s_wro_seq_files); + +unlock: + mutex_unlock(&zi->i_truncate_mutex); +} + +static int zonefs_file_release(struct inode *inode, struct file *file) +{ + /* + * If we explicitly open a zone we must close it again as well, but the + * zone management operation can fail (either due to an IO error or as + * the zone has gone offline or read-only). Make sure we don't fail the + * close(2) for user-space. + */ + if (zonefs_seq_file_need_wro(inode, file)) + zonefs_seq_file_write_close(inode); + + return 0; +} + +const struct file_operations zonefs_file_operations = { + .open = zonefs_file_open, + .release = zonefs_file_release, + .fsync = zonefs_file_fsync, + .mmap = zonefs_file_mmap, + .llseek = zonefs_file_llseek, + .read_iter = zonefs_file_read_iter, + .write_iter = zonefs_file_write_iter, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, + .iopoll = iocb_bio_iopoll, +}; diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index a9c5c3f720ad..23b8b299c64e 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -28,33 +28,47 @@ #include "trace.h" /* - * Manage the active zone count. Called with zi->i_truncate_mutex held. + * Get the name of a zone group directory. */ -static void zonefs_account_active(struct inode *inode) +static const char *zonefs_zgroup_name(enum zonefs_ztype ztype) { - struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); - struct zonefs_inode_info *zi = ZONEFS_I(inode); + switch (ztype) { + case ZONEFS_ZTYPE_CNV: + return "cnv"; + case ZONEFS_ZTYPE_SEQ: + return "seq"; + default: + WARN_ON_ONCE(1); + return "???"; + } +} - lockdep_assert_held(&zi->i_truncate_mutex); +/* + * Manage the active zone count. + */ +static void zonefs_account_active(struct super_block *sb, + struct zonefs_zone *z) +{ + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); - if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) + if (zonefs_zone_is_cnv(z)) return; /* * For zones that transitioned to the offline or readonly condition, * we only need to clear the active state. */ - if (zi->i_flags & (ZONEFS_ZONE_OFFLINE | ZONEFS_ZONE_READONLY)) + if (z->z_flags & (ZONEFS_ZONE_OFFLINE | ZONEFS_ZONE_READONLY)) goto out; /* * If the zone is active, that is, if it is explicitly open or * partially written, check if it was already accounted as active. */ - if ((zi->i_flags & ZONEFS_ZONE_OPEN) || - (zi->i_wpoffset > 0 && zi->i_wpoffset < zi->i_max_size)) { - if (!(zi->i_flags & ZONEFS_ZONE_ACTIVE)) { - zi->i_flags |= ZONEFS_ZONE_ACTIVE; + if ((z->z_flags & ZONEFS_ZONE_OPEN) || + (z->z_wpoffset > 0 && z->z_wpoffset < z->z_capacity)) { + if (!(z->z_flags & ZONEFS_ZONE_ACTIVE)) { + z->z_flags |= ZONEFS_ZONE_ACTIVE; atomic_inc(&sbi->s_active_seq_files); } return; @@ -62,18 +76,29 @@ static void zonefs_account_active(struct inode *inode) out: /* The zone is not active. If it was, update the active count */ - if (zi->i_flags & ZONEFS_ZONE_ACTIVE) { - zi->i_flags &= ~ZONEFS_ZONE_ACTIVE; + if (z->z_flags & ZONEFS_ZONE_ACTIVE) { + z->z_flags &= ~ZONEFS_ZONE_ACTIVE; atomic_dec(&sbi->s_active_seq_files); } } -static inline int zonefs_zone_mgmt(struct inode *inode, enum req_op op) +/* + * Manage the active zone count. Called with zi->i_truncate_mutex held. 
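+ * + * A zone counts as active when it is explicitly open or partially + * written (0 < wpoffset < capacity); a sequential file that is fully + * written or truncated back to 0 therefore stops counting against + * sbi->s_active_seq_files.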
+ */ +void zonefs_inode_account_active(struct inode *inode) { - struct zonefs_inode_info *zi = ZONEFS_I(inode); - int ret; + lockdep_assert_held(&ZONEFS_I(inode)->i_truncate_mutex); - lockdep_assert_held(&zi->i_truncate_mutex); + return zonefs_account_active(inode->i_sb, zonefs_inode_zone(inode)); +} + +/* + * Execute a zone management operation. + */ +static int zonefs_zone_mgmt(struct super_block *sb, + struct zonefs_zone *z, enum req_op op) +{ + int ret; /* * With ZNS drives, closing an explicitly open zone that has not been @@ -83,201 +108,49 @@ static inline int zonefs_zone_mgmt(struct inode *inode, enum req_op op) * are exceeded, make sure that the zone does not remain active by * resetting it. */ - if (op == REQ_OP_ZONE_CLOSE && !zi->i_wpoffset) + if (op == REQ_OP_ZONE_CLOSE && !z->z_wpoffset) op = REQ_OP_ZONE_RESET; - trace_zonefs_zone_mgmt(inode, op); - ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector, - zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS); + trace_zonefs_zone_mgmt(sb, z, op); + ret = blkdev_zone_mgmt(sb->s_bdev, op, z->z_sector, + z->z_size >> SECTOR_SHIFT, GFP_NOFS); if (ret) { - zonefs_err(inode->i_sb, + zonefs_err(sb, "Zone management operation %s at %llu failed %d\n", - blk_op_str(op), zi->i_zsector, ret); + blk_op_str(op), z->z_sector, ret); return ret; } return 0; } -static inline void zonefs_i_size_write(struct inode *inode, loff_t isize) -{ - struct zonefs_inode_info *zi = ZONEFS_I(inode); - - i_size_write(inode, isize); - /* - * A full zone is no longer open/active and does not need - * explicit closing. - */ - if (isize >= zi->i_max_size) { - struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); - - if (zi->i_flags & ZONEFS_ZONE_ACTIVE) - atomic_dec(&sbi->s_active_seq_files); - zi->i_flags &= ~(ZONEFS_ZONE_OPEN | ZONEFS_ZONE_ACTIVE); - } -} - -static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, - loff_t length, unsigned int flags, - struct iomap *iomap, struct iomap *srcmap) +int zonefs_inode_zone_mgmt(struct inode *inode, enum req_op op) { - struct zonefs_inode_info *zi = ZONEFS_I(inode); - struct super_block *sb = inode->i_sb; - loff_t isize; + lockdep_assert_held(&ZONEFS_I(inode)->i_truncate_mutex); - /* - * All blocks are always mapped below EOF. If reading past EOF, - * act as if there is a hole up to the file maximum size. 
- */ - mutex_lock(&zi->i_truncate_mutex); - iomap->bdev = inode->i_sb->s_bdev; - iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); - isize = i_size_read(inode); - if (iomap->offset >= isize) { - iomap->type = IOMAP_HOLE; - iomap->addr = IOMAP_NULL_ADDR; - iomap->length = length; - } else { - iomap->type = IOMAP_MAPPED; - iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; - iomap->length = isize - iomap->offset; - } - mutex_unlock(&zi->i_truncate_mutex); - - trace_zonefs_iomap_begin(inode, iomap); - - return 0; + return zonefs_zone_mgmt(inode->i_sb, zonefs_inode_zone(inode), op); } -static const struct iomap_ops zonefs_read_iomap_ops = { - .iomap_begin = zonefs_read_iomap_begin, -}; - -static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, - loff_t length, unsigned int flags, - struct iomap *iomap, struct iomap *srcmap) +void zonefs_i_size_write(struct inode *inode, loff_t isize) { - struct zonefs_inode_info *zi = ZONEFS_I(inode); - struct super_block *sb = inode->i_sb; - loff_t isize; + struct zonefs_zone *z = zonefs_inode_zone(inode); - /* All write I/Os should always be within the file maximum size */ - if (WARN_ON_ONCE(offset + length > zi->i_max_size)) - return -EIO; - - /* - * Sequential zones can only accept direct writes. This is already - * checked when writes are issued, so warn if we see a page writeback - * operation. - */ - if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ && - !(flags & IOMAP_DIRECT))) - return -EIO; + i_size_write(inode, isize); /* - * For conventional zones, all blocks are always mapped. For sequential - * zones, all blocks after always mapped below the inode size (zone - * write pointer) and unwriten beyond. + * A full zone is no longer open/active and does not need + * explicit closing. */ - mutex_lock(&zi->i_truncate_mutex); - iomap->bdev = inode->i_sb->s_bdev; - iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); - iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; - isize = i_size_read(inode); - if (iomap->offset >= isize) { - iomap->type = IOMAP_UNWRITTEN; - iomap->length = zi->i_max_size - iomap->offset; - } else { - iomap->type = IOMAP_MAPPED; - iomap->length = isize - iomap->offset; - } - mutex_unlock(&zi->i_truncate_mutex); - - trace_zonefs_iomap_begin(inode, iomap); - - return 0; -} - -static const struct iomap_ops zonefs_write_iomap_ops = { - .iomap_begin = zonefs_write_iomap_begin, -}; - -static int zonefs_read_folio(struct file *unused, struct folio *folio) -{ - return iomap_read_folio(folio, &zonefs_read_iomap_ops); -} - -static void zonefs_readahead(struct readahead_control *rac) -{ - iomap_readahead(rac, &zonefs_read_iomap_ops); -} - -/* - * Map blocks for page writeback. This is used only on conventional zone files, - * which implies that the page range can only be within the fixed inode size. 
- */ -static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, - struct inode *inode, loff_t offset) -{ - struct zonefs_inode_info *zi = ZONEFS_I(inode); - - if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV)) - return -EIO; - if (WARN_ON_ONCE(offset >= i_size_read(inode))) - return -EIO; - - /* If the mapping is already OK, nothing needs to be done */ - if (offset >= wpc->iomap.offset && - offset < wpc->iomap.offset + wpc->iomap.length) - return 0; - - return zonefs_write_iomap_begin(inode, offset, zi->i_max_size - offset, - IOMAP_WRITE, &wpc->iomap, NULL); -} - -static const struct iomap_writeback_ops zonefs_writeback_ops = { - .map_blocks = zonefs_write_map_blocks, -}; - -static int zonefs_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - struct iomap_writepage_ctx wpc = { }; - - return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops); -} - -static int zonefs_swap_activate(struct swap_info_struct *sis, - struct file *swap_file, sector_t *span) -{ - struct inode *inode = file_inode(swap_file); - struct zonefs_inode_info *zi = ZONEFS_I(inode); + if (isize >= z->z_capacity) { + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); - if (zi->i_ztype != ZONEFS_ZTYPE_CNV) { - zonefs_err(inode->i_sb, - "swap file: not a conventional zone file\n"); - return -EINVAL; + if (z->z_flags & ZONEFS_ZONE_ACTIVE) + atomic_dec(&sbi->s_active_seq_files); + z->z_flags &= ~(ZONEFS_ZONE_OPEN | ZONEFS_ZONE_ACTIVE); } - - return iomap_swapfile_activate(sis, swap_file, span, - &zonefs_read_iomap_ops); } -static const struct address_space_operations zonefs_file_aops = { - .read_folio = zonefs_read_folio, - .readahead = zonefs_readahead, - .writepages = zonefs_writepages, - .dirty_folio = filemap_dirty_folio, - .release_folio = iomap_release_folio, - .invalidate_folio = iomap_invalidate_folio, - .migrate_folio = filemap_migrate_folio, - .is_partially_uptodate = iomap_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, - .direct_IO = noop_direct_IO, - .swap_activate = zonefs_swap_activate, -}; - -static void zonefs_update_stats(struct inode *inode, loff_t new_isize) +void zonefs_update_stats(struct inode *inode, loff_t new_isize) { struct super_block *sb = inode->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); @@ -310,63 +183,69 @@ static void zonefs_update_stats(struct inode *inode, loff_t new_isize) } /* - * Check a zone condition and adjust its file inode access permissions for - * offline and readonly zones. Return the inode size corresponding to the - * amount of readable data in the zone. + * Check a zone condition. Return the amount of written (and still readable) + * data in the zone. */ -static loff_t zonefs_check_zone_condition(struct inode *inode, - struct blk_zone *zone, bool warn, - bool mount) +static loff_t zonefs_check_zone_condition(struct super_block *sb, + struct zonefs_zone *z, + struct blk_zone *zone) { - struct zonefs_inode_info *zi = ZONEFS_I(inode); - switch (zone->cond) { case BLK_ZONE_COND_OFFLINE: - /* - * Dead zone: make the inode immutable, disable all accesses - * and set the file size to 0 (zone wp set to zone start). 
- */ - if (warn) - zonefs_warn(inode->i_sb, "inode %lu: offline zone\n", - inode->i_ino); - inode->i_flags |= S_IMMUTABLE; - inode->i_mode &= ~0777; - zone->wp = zone->start; - zi->i_flags |= ZONEFS_ZONE_OFFLINE; + zonefs_warn(sb, "Zone %llu: offline zone\n", + z->z_sector); + z->z_flags |= ZONEFS_ZONE_OFFLINE; return 0; case BLK_ZONE_COND_READONLY: /* - * The write pointer of read-only zones is invalid. If such a - * zone is found during mount, the file size cannot be retrieved - * so we treat the zone as offline (mount == true case). - * Otherwise, keep the file size as it was when last updated - * so that the user can recover data. In both cases, writes are - * always disabled for the zone. + * The write pointer of read-only zones is invalid, so we cannot + * determine the zone wpoffset (inode size). We thus keep the + * zone wpoffset as is, which leads to an empty file + * (wpoffset == 0) on mount. For a runtime error, this keeps + * the inode size as it was when last updated so that the user + * can recover data. */ - if (warn) - zonefs_warn(inode->i_sb, "inode %lu: read-only zone\n", - inode->i_ino); - inode->i_flags |= S_IMMUTABLE; - if (mount) { - zone->cond = BLK_ZONE_COND_OFFLINE; - inode->i_mode &= ~0777; - zone->wp = zone->start; - zi->i_flags |= ZONEFS_ZONE_OFFLINE; - return 0; - } - zi->i_flags |= ZONEFS_ZONE_READONLY; - inode->i_mode &= ~0222; - return i_size_read(inode); + zonefs_warn(sb, "Zone %llu: read-only zone\n", + z->z_sector); + z->z_flags |= ZONEFS_ZONE_READONLY; + if (zonefs_zone_is_cnv(z)) + return z->z_capacity; + return z->z_wpoffset; case BLK_ZONE_COND_FULL: /* The write pointer of full zones is invalid. */ - return zi->i_max_size; + return z->z_capacity; default: - if (zi->i_ztype == ZONEFS_ZTYPE_CNV) - return zi->i_max_size; + if (zonefs_zone_is_cnv(z)) + return z->z_capacity; return (zone->wp - zone->start) << SECTOR_SHIFT; } } +/* + * Check a zone condition and adjust its inode access permissions for + * offline and readonly zones. + */ +static void zonefs_inode_update_mode(struct inode *inode) +{ + struct zonefs_zone *z = zonefs_inode_zone(inode); + + if (z->z_flags & ZONEFS_ZONE_OFFLINE) { + /* Offline zones cannot be read nor written */ + inode->i_flags |= S_IMMUTABLE; + inode->i_mode &= ~0777; + } else if (z->z_flags & ZONEFS_ZONE_READONLY) { + /* Readonly zones cannot be written */ + inode->i_flags |= S_IMMUTABLE; + if (z->z_flags & ZONEFS_ZONE_INIT_MODE) + inode->i_mode &= ~0777; + else + inode->i_mode &= ~0222; + } + + z->z_flags &= ~ZONEFS_ZONE_INIT_MODE; + z->z_mode = inode->i_mode; +} + struct zonefs_ioerr_data { struct inode *inode; bool write; @@ -377,7 +256,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, { struct zonefs_ioerr_data *err = data; struct inode *inode = err->inode; - struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); struct super_block *sb = inode->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); loff_t isize, data_size; @@ -388,10 +267,9 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * as there is no inconsistency between the inode size and the amount of * data writen in the zone (data_size). 
*/ - data_size = zonefs_check_zone_condition(inode, zone, true, false); + data_size = zonefs_check_zone_condition(sb, z, zone); isize = i_size_read(inode); - if (zone->cond != BLK_ZONE_COND_OFFLINE && - zone->cond != BLK_ZONE_COND_READONLY && + if (!(z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) && !err->write && isize == data_size) return 0; @@ -414,8 +292,9 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * In all cases, warn about inode size inconsistency and handle the * IO error according to the zone condition and to the mount options. */ - if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && isize != data_size) - zonefs_warn(sb, "inode %lu: invalid size %lld (should be %lld)\n", + if (zonefs_zone_is_seq(z) && isize != data_size) + zonefs_warn(sb, + "inode %lu: invalid size %lld (should be %lld)\n", inode->i_ino, isize, data_size); /* @@ -424,24 +303,22 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * zone condition to read-only and offline respectively, as if the * condition was signaled by the hardware. */ - if (zone->cond == BLK_ZONE_COND_OFFLINE || - sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL) { + if ((z->z_flags & ZONEFS_ZONE_OFFLINE) || + (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL)) { zonefs_warn(sb, "inode %lu: read/write access disabled\n", inode->i_ino); - if (zone->cond != BLK_ZONE_COND_OFFLINE) { - zone->cond = BLK_ZONE_COND_OFFLINE; - data_size = zonefs_check_zone_condition(inode, zone, - false, false); - } - } else if (zone->cond == BLK_ZONE_COND_READONLY || - sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO) { + if (!(z->z_flags & ZONEFS_ZONE_OFFLINE)) + z->z_flags |= ZONEFS_ZONE_OFFLINE; + zonefs_inode_update_mode(inode); + data_size = 0; + } else if ((z->z_flags & ZONEFS_ZONE_READONLY) || + (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO)) { zonefs_warn(sb, "inode %lu: write access disabled\n", inode->i_ino); - if (zone->cond != BLK_ZONE_COND_READONLY) { - zone->cond = BLK_ZONE_COND_READONLY; - data_size = zonefs_check_zone_condition(inode, zone, - false, false); - } + if (!(z->z_flags & ZONEFS_ZONE_READONLY)) + z->z_flags |= ZONEFS_ZONE_READONLY; + zonefs_inode_update_mode(inode); + data_size = isize; } else if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO && data_size > isize) { /* Do not expose garbage data */ @@ -455,9 +332,8 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * close of the zone when the inode file is closed. */ if ((sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) && - (zone->cond == BLK_ZONE_COND_OFFLINE || - zone->cond == BLK_ZONE_COND_READONLY)) - zi->i_flags &= ~ZONEFS_ZONE_OPEN; + (z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE))) + z->z_flags &= ~ZONEFS_ZONE_OPEN; /* * If error=remount-ro was specified, any error result in remounting @@ -474,8 +350,8 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, */ zonefs_update_stats(inode, data_size); zonefs_i_size_write(inode, data_size); - zi->i_wpoffset = data_size; - zonefs_account_active(inode); + z->z_wpoffset = data_size; + zonefs_inode_account_active(inode); return 0; } @@ -487,9 +363,9 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * eventually correct the file size and zonefs inode write pointer offset * (which can be out of sync with the drive due to partial write failures). 
*/ -static void __zonefs_io_error(struct inode *inode, bool write) +void __zonefs_io_error(struct inode *inode, bool write) { - struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); struct super_block *sb = inode->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); unsigned int noio_flag; @@ -505,8 +381,8 @@ static void __zonefs_io_error(struct inode *inode, bool write) * files with aggregated conventional zones, for which the inode zone * size is always larger than the device zone size. */ - if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev)) - nr_zones = zi->i_zone_size >> + if (z->z_size > bdev_zone_sectors(sb->s_bdev)) + nr_zones = z->z_size >> (sbi->s_zone_sectors_shift + SECTOR_SHIFT); /* @@ -518,7 +394,7 @@ static void __zonefs_io_error(struct inode *inode, bool write) * the GFP_NOIO context avoids both problems. */ noio_flag = memalloc_noio_save(); - ret = blkdev_report_zones(sb->s_bdev, zi->i_zsector, nr_zones, + ret = blkdev_report_zones(sb->s_bdev, z->z_sector, nr_zones, zonefs_io_error_cb, &err); if (ret != nr_zones) zonefs_err(sb, "Get inode %lu zone information failed %d\n", @@ -526,749 +402,6 @@ static void __zonefs_io_error(struct inode *inode, bool write) memalloc_noio_restore(noio_flag); } -static void zonefs_io_error(struct inode *inode, bool write) -{ - struct zonefs_inode_info *zi = ZONEFS_I(inode); - - mutex_lock(&zi->i_truncate_mutex); - __zonefs_io_error(inode, write); - mutex_unlock(&zi->i_truncate_mutex); -} - -static int zonefs_file_truncate(struct inode *inode, loff_t isize) -{ - struct zonefs_inode_info *zi = ZONEFS_I(inode); - loff_t old_isize; - enum req_op op; - int ret = 0; - - /* - * Only sequential zone files can be truncated and truncation is allowed - * only down to a 0 size, which is equivalent to a zone reset, and to - * the maximum file size, which is equivalent to a zone finish. - */ - if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) - return -EPERM; - - if (!isize) - op = REQ_OP_ZONE_RESET; - else if (isize == zi->i_max_size) - op = REQ_OP_ZONE_FINISH; - else - return -EPERM; - - inode_dio_wait(inode); - - /* Serialize against page faults */ - filemap_invalidate_lock(inode->i_mapping); - - /* Serialize against zonefs_iomap_begin() */ - mutex_lock(&zi->i_truncate_mutex); - - old_isize = i_size_read(inode); - if (isize == old_isize) - goto unlock; - - ret = zonefs_zone_mgmt(inode, op); - if (ret) - goto unlock; - - /* - * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set, - * take care of open zones. - */ - if (zi->i_flags & ZONEFS_ZONE_OPEN) { - /* - * Truncating a zone to EMPTY or FULL is the equivalent of - * closing the zone. For a truncation to 0, we need to - * re-open the zone to ensure new writes can be processed. - * For a truncation to the maximum file size, the zone is - * closed and writes cannot be accepted anymore, so clear - * the open flag. 
- */ - if (!isize) - ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); - else - zi->i_flags &= ~ZONEFS_ZONE_OPEN; - } - - zonefs_update_stats(inode, isize); - truncate_setsize(inode, isize); - zi->i_wpoffset = isize; - zonefs_account_active(inode); - -unlock: - mutex_unlock(&zi->i_truncate_mutex); - filemap_invalidate_unlock(inode->i_mapping); - - return ret; -} - -static int zonefs_inode_setattr(struct user_namespace *mnt_userns, - struct dentry *dentry, struct iattr *iattr) -{ - struct inode *inode = d_inode(dentry); - int ret; - - if (unlikely(IS_IMMUTABLE(inode))) - return -EPERM; - - ret = setattr_prepare(&init_user_ns, dentry, iattr); - if (ret) - return ret; - - /* - * Since files and directories cannot be created nor deleted, do not - * allow setting any write attributes on the sub-directories grouping - * files by zone type. - */ - if ((iattr->ia_valid & ATTR_MODE) && S_ISDIR(inode->i_mode) && - (iattr->ia_mode & 0222)) - return -EPERM; - - if (((iattr->ia_valid & ATTR_UID) && - !uid_eq(iattr->ia_uid, inode->i_uid)) || - ((iattr->ia_valid & ATTR_GID) && - !gid_eq(iattr->ia_gid, inode->i_gid))) { - ret = dquot_transfer(mnt_userns, inode, iattr); - if (ret) - return ret; - } - - if (iattr->ia_valid & ATTR_SIZE) { - ret = zonefs_file_truncate(inode, iattr->ia_size); - if (ret) - return ret; - } - - setattr_copy(&init_user_ns, inode, iattr); - - return 0; -} - -static const struct inode_operations zonefs_file_inode_operations = { - .setattr = zonefs_inode_setattr, -}; - -static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, - int datasync) -{ - struct inode *inode = file_inode(file); - int ret = 0; - - if (unlikely(IS_IMMUTABLE(inode))) - return -EPERM; - - /* - * Since only direct writes are allowed in sequential files, page cache - * flush is needed only for conventional zone files. - */ - if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV) - ret = file_write_and_wait_range(file, start, end); - if (!ret) - ret = blkdev_issue_flush(inode->i_sb->s_bdev); - - if (ret) - zonefs_io_error(inode, true); - - return ret; -} - -static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) -{ - struct inode *inode = file_inode(vmf->vma->vm_file); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - vm_fault_t ret; - - if (unlikely(IS_IMMUTABLE(inode))) - return VM_FAULT_SIGBUS; - - /* - * Sanity check: only conventional zone files can have shared - * writeable mappings. - */ - if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV)) - return VM_FAULT_NOPAGE; - - sb_start_pagefault(inode->i_sb); - file_update_time(vmf->vma->vm_file); - - /* Serialize against truncates */ - filemap_invalidate_lock_shared(inode->i_mapping); - ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops); - filemap_invalidate_unlock_shared(inode->i_mapping); - - sb_end_pagefault(inode->i_sb); - return ret; -} - -static const struct vm_operations_struct zonefs_file_vm_ops = { - .fault = filemap_fault, - .map_pages = filemap_map_pages, - .page_mkwrite = zonefs_filemap_page_mkwrite, -}; - -static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma) -{ - /* - * Conventional zones accept random writes, so their files can support - * shared writable mappings. For sequential zone files, only read - * mappings are possible since there are no guarantees for write - * ordering between msync() and page cache writeback. 
- */ - if (ZONEFS_I(file_inode(file))->i_ztype == ZONEFS_ZTYPE_SEQ && - (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) - return -EINVAL; - - file_accessed(file); - vma->vm_ops = &zonefs_file_vm_ops; - - return 0; -} - -static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence) -{ - loff_t isize = i_size_read(file_inode(file)); - - /* - * Seeks are limited to below the zone size for conventional zones - * and below the zone write pointer for sequential zones. In both - * cases, this limit is the inode size. - */ - return generic_file_llseek_size(file, offset, whence, isize, isize); -} - -static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, - int error, unsigned int flags) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - - if (error) { - zonefs_io_error(inode, true); - return error; - } - - if (size && zi->i_ztype != ZONEFS_ZTYPE_CNV) { - /* - * Note that we may be seeing completions out of order, - * but that is not a problem since a write completed - * successfully necessarily means that all preceding writes - * were also successful. So we can safely increase the inode - * size to the write end location. - */ - mutex_lock(&zi->i_truncate_mutex); - if (i_size_read(inode) < iocb->ki_pos + size) { - zonefs_update_stats(inode, iocb->ki_pos + size); - zonefs_i_size_write(inode, iocb->ki_pos + size); - } - mutex_unlock(&zi->i_truncate_mutex); - } - - return 0; -} - -static const struct iomap_dio_ops zonefs_write_dio_ops = { - .end_io = zonefs_file_write_dio_end_io, -}; - -static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - struct block_device *bdev = inode->i_sb->s_bdev; - unsigned int max = bdev_max_zone_append_sectors(bdev); - struct bio *bio; - ssize_t size; - int nr_pages; - ssize_t ret; - - max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize); - iov_iter_truncate(from, max); - - nr_pages = iov_iter_npages(from, BIO_MAX_VECS); - if (!nr_pages) - return 0; - - bio = bio_alloc(bdev, nr_pages, - REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS); - bio->bi_iter.bi_sector = zi->i_zsector; - bio->bi_ioprio = iocb->ki_ioprio; - if (iocb_is_dsync(iocb)) - bio->bi_opf |= REQ_FUA; - - ret = bio_iov_iter_get_pages(bio, from); - if (unlikely(ret)) - goto out_release; - - size = bio->bi_iter.bi_size; - task_io_account_write(size); - - if (iocb->ki_flags & IOCB_HIPRI) - bio_set_polled(bio, iocb); - - ret = submit_bio_wait(bio); - - /* - * If the file zone was written underneath the file system, the zone - * write pointer may not be where we expect it to be, but the zone - * append write can still succeed. So check manually that we wrote where - * we intended to, that is, at zi->i_wpoffset. - */ - if (!ret) { - sector_t wpsector = - zi->i_zsector + (zi->i_wpoffset >> SECTOR_SHIFT); - - if (bio->bi_iter.bi_sector != wpsector) { - zonefs_warn(inode->i_sb, - "Corrupted write pointer %llu for zone at %llu\n", - wpsector, zi->i_zsector); - ret = -EIO; - } - } - - zonefs_file_write_dio_end_io(iocb, size, ret, 0); - trace_zonefs_file_dio_append(inode, size, ret); - -out_release: - bio_release_pages(bio, false); - bio_put(bio); - - if (ret >= 0) { - iocb->ki_pos += size; - return size; - } - - return ret; -} - -/* - * Do not exceed the LFS limits nor the file zone size. If pos is under the - * limit it becomes a short access. 
If it exceeds the limit, return -EFBIG. - */ -static loff_t zonefs_write_check_limits(struct file *file, loff_t pos, - loff_t count) -{ - struct inode *inode = file_inode(file); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - loff_t limit = rlimit(RLIMIT_FSIZE); - loff_t max_size = zi->i_max_size; - - if (limit != RLIM_INFINITY) { - if (pos >= limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - count = min(count, limit - pos); - } - - if (!(file->f_flags & O_LARGEFILE)) - max_size = min_t(loff_t, MAX_NON_LFS, max_size); - - if (unlikely(pos >= max_size)) - return -EFBIG; - - return min(count, max_size - pos); -} - -static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - loff_t count; - - if (IS_SWAPFILE(inode)) - return -ETXTBSY; - - if (!iov_iter_count(from)) - return 0; - - if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) - return -EINVAL; - - if (iocb->ki_flags & IOCB_APPEND) { - if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) - return -EINVAL; - mutex_lock(&zi->i_truncate_mutex); - iocb->ki_pos = zi->i_wpoffset; - mutex_unlock(&zi->i_truncate_mutex); - } - - count = zonefs_write_check_limits(file, iocb->ki_pos, - iov_iter_count(from)); - if (count < 0) - return count; - - iov_iter_truncate(from, count); - return iov_iter_count(from); -} - -/* - * Handle direct writes. For sequential zone files, this is the only possible - * write path. For these files, check that the user is issuing writes - * sequentially from the end of the file. This code assumes that the block layer - * delivers write requests to the device in sequential order. This is always the - * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE - * elevator feature is being used (e.g. mq-deadline). The block layer always - * automatically select such an elevator for zoned block devices during the - * device initialization. - */ -static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - struct super_block *sb = inode->i_sb; - bool sync = is_sync_kiocb(iocb); - bool append = false; - ssize_t ret, count; - - /* - * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT - * as this can cause write reordering (e.g. the first aio gets EAGAIN - * on the inode lock but the second goes through but is now unaligned). 
- */ - if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !sync && - (iocb->ki_flags & IOCB_NOWAIT)) - return -EOPNOTSUPP; - - if (iocb->ki_flags & IOCB_NOWAIT) { - if (!inode_trylock(inode)) - return -EAGAIN; - } else { - inode_lock(inode); - } - - count = zonefs_write_checks(iocb, from); - if (count <= 0) { - ret = count; - goto inode_unlock; - } - - if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { - ret = -EINVAL; - goto inode_unlock; - } - - /* Enforce sequential writes (append only) in sequential zones */ - if (zi->i_ztype == ZONEFS_ZTYPE_SEQ) { - mutex_lock(&zi->i_truncate_mutex); - if (iocb->ki_pos != zi->i_wpoffset) { - mutex_unlock(&zi->i_truncate_mutex); - ret = -EINVAL; - goto inode_unlock; - } - mutex_unlock(&zi->i_truncate_mutex); - append = sync; - } - - if (append) - ret = zonefs_file_dio_append(iocb, from); - else - ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, - &zonefs_write_dio_ops, 0, NULL, 0); - if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && - (ret > 0 || ret == -EIOCBQUEUED)) { - if (ret > 0) - count = ret; - - /* - * Update the zone write pointer offset assuming the write - * operation succeeded. If it did not, the error recovery path - * will correct it. Also do active seq file accounting. - */ - mutex_lock(&zi->i_truncate_mutex); - zi->i_wpoffset += count; - zonefs_account_active(inode); - mutex_unlock(&zi->i_truncate_mutex); - } - -inode_unlock: - inode_unlock(inode); - - return ret; -} - -static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, - struct iov_iter *from) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - ssize_t ret; - - /* - * Direct IO writes are mandatory for sequential zone files so that the - * write IO issuing order is preserved. - */ - if (zi->i_ztype != ZONEFS_ZTYPE_CNV) - return -EIO; - - if (iocb->ki_flags & IOCB_NOWAIT) { - if (!inode_trylock(inode)) - return -EAGAIN; - } else { - inode_lock(inode); - } - - ret = zonefs_write_checks(iocb, from); - if (ret <= 0) - goto inode_unlock; - - ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops); - if (ret > 0) - iocb->ki_pos += ret; - else if (ret == -EIO) - zonefs_io_error(inode, true); - -inode_unlock: - inode_unlock(inode); - if (ret > 0) - ret = generic_write_sync(iocb, ret); - - return ret; -} - -static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct inode *inode = file_inode(iocb->ki_filp); - - if (unlikely(IS_IMMUTABLE(inode))) - return -EPERM; - - if (sb_rdonly(inode->i_sb)) - return -EROFS; - - /* Write operations beyond the zone size are not allowed */ - if (iocb->ki_pos >= ZONEFS_I(inode)->i_max_size) - return -EFBIG; - - if (iocb->ki_flags & IOCB_DIRECT) { - ssize_t ret = zonefs_file_dio_write(iocb, from); - if (ret != -ENOTBLK) - return ret; - } - - return zonefs_file_buffered_write(iocb, from); -} - -static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size, - int error, unsigned int flags) -{ - if (error) { - zonefs_io_error(file_inode(iocb->ki_filp), false); - return error; - } - - return 0; -} - -static const struct iomap_dio_ops zonefs_read_dio_ops = { - .end_io = zonefs_file_read_dio_end_io, -}; - -static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - struct super_block *sb = inode->i_sb; - loff_t isize; - ssize_t ret; - - /* Offline zones cannot be read */ - if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777))) - 
return -EPERM; - - if (iocb->ki_pos >= zi->i_max_size) - return 0; - - if (iocb->ki_flags & IOCB_NOWAIT) { - if (!inode_trylock_shared(inode)) - return -EAGAIN; - } else { - inode_lock_shared(inode); - } - - /* Limit read operations to written data */ - mutex_lock(&zi->i_truncate_mutex); - isize = i_size_read(inode); - if (iocb->ki_pos >= isize) { - mutex_unlock(&zi->i_truncate_mutex); - ret = 0; - goto inode_unlock; - } - iov_iter_truncate(to, isize - iocb->ki_pos); - mutex_unlock(&zi->i_truncate_mutex); - - if (iocb->ki_flags & IOCB_DIRECT) { - size_t count = iov_iter_count(to); - - if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { - ret = -EINVAL; - goto inode_unlock; - } - file_accessed(iocb->ki_filp); - ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops, - &zonefs_read_dio_ops, 0, NULL, 0); - } else { - ret = generic_file_read_iter(iocb, to); - if (ret == -EIO) - zonefs_io_error(inode, false); - } - -inode_unlock: - inode_unlock_shared(inode); - - return ret; -} - -/* - * Write open accounting is done only for sequential files. - */ -static inline bool zonefs_seq_file_need_wro(struct inode *inode, - struct file *file) -{ - struct zonefs_inode_info *zi = ZONEFS_I(inode); - - if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) - return false; - - if (!(file->f_mode & FMODE_WRITE)) - return false; - - return true; -} - -static int zonefs_seq_file_write_open(struct inode *inode) -{ - struct zonefs_inode_info *zi = ZONEFS_I(inode); - int ret = 0; - - mutex_lock(&zi->i_truncate_mutex); - - if (!zi->i_wr_refcnt) { - struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); - unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files); - - if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { - - if (sbi->s_max_wro_seq_files - && wro > sbi->s_max_wro_seq_files) { - atomic_dec(&sbi->s_wro_seq_files); - ret = -EBUSY; - goto unlock; - } - - if (i_size_read(inode) < zi->i_max_size) { - ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); - if (ret) { - atomic_dec(&sbi->s_wro_seq_files); - goto unlock; - } - zi->i_flags |= ZONEFS_ZONE_OPEN; - zonefs_account_active(inode); - } - } - } - - zi->i_wr_refcnt++; - -unlock: - mutex_unlock(&zi->i_truncate_mutex); - - return ret; -} - -static int zonefs_file_open(struct inode *inode, struct file *file) -{ - int ret; - - ret = generic_file_open(inode, file); - if (ret) - return ret; - - if (zonefs_seq_file_need_wro(inode, file)) - return zonefs_seq_file_write_open(inode); - - return 0; -} - -static void zonefs_seq_file_write_close(struct inode *inode) -{ - struct zonefs_inode_info *zi = ZONEFS_I(inode); - struct super_block *sb = inode->i_sb; - struct zonefs_sb_info *sbi = ZONEFS_SB(sb); - int ret = 0; - - mutex_lock(&zi->i_truncate_mutex); - - zi->i_wr_refcnt--; - if (zi->i_wr_refcnt) - goto unlock; - - /* - * The file zone may not be open anymore (e.g. the file was truncated to - * its maximum size or it was fully written). For this case, we only - * need to decrement the write open count. - */ - if (zi->i_flags & ZONEFS_ZONE_OPEN) { - ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); - if (ret) { - __zonefs_io_error(inode, false); - /* - * Leaving zones explicitly open may lead to a state - * where most zones cannot be written (zone resources - * exhausted). So take preventive action by remounting - * read-only. 
- */ - if (zi->i_flags & ZONEFS_ZONE_OPEN && - !(sb->s_flags & SB_RDONLY)) { - zonefs_warn(sb, - "closing zone at %llu failed %d\n", - zi->i_zsector, ret); - zonefs_warn(sb, - "remounting filesystem read-only\n"); - sb->s_flags |= SB_RDONLY; - } - goto unlock; - } - - zi->i_flags &= ~ZONEFS_ZONE_OPEN; - zonefs_account_active(inode); - } - - atomic_dec(&sbi->s_wro_seq_files); - -unlock: - mutex_unlock(&zi->i_truncate_mutex); -} - -static int zonefs_file_release(struct inode *inode, struct file *file) -{ - /* - * If we explicitly open a zone we must close it again as well, but the - * zone management operation can fail (either due to an IO error or as - * the zone has gone offline or read-only). Make sure we don't fail the - * close(2) for user-space. - */ - if (zonefs_seq_file_need_wro(inode, file)) - zonefs_seq_file_write_close(inode); - - return 0; -} - -static const struct file_operations zonefs_file_operations = { - .open = zonefs_file_open, - .release = zonefs_file_release, - .fsync = zonefs_file_fsync, - .mmap = zonefs_file_mmap, - .llseek = zonefs_file_llseek, - .read_iter = zonefs_file_read_iter, - .write_iter = zonefs_file_write_iter, - .splice_read = generic_file_splice_read, - .splice_write = iter_file_splice_write, - .iopoll = iocb_bio_iopoll, -}; - static struct kmem_cache *zonefs_inode_cachep; static struct inode *zonefs_alloc_inode(struct super_block *sb) @@ -1282,7 +415,6 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb) inode_init_once(&zi->i_vnode); mutex_init(&zi->i_truncate_mutex); zi->i_wr_refcnt = 0; - zi->i_flags = 0; return &zi->i_vnode; } @@ -1315,8 +447,8 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = buf->f_bfree; for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) { - if (sbi->s_nr_files[t]) - buf->f_files += sbi->s_nr_files[t] + 1; + if (sbi->s_zgroup[t].g_nr_zones) + buf->f_files += sbi->s_zgroup[t].g_nr_zones + 1; } buf->f_ffree = 0; @@ -1408,185 +540,440 @@ static int zonefs_remount(struct super_block *sb, int *flags, char *data) return zonefs_parse_options(sb, data); } -static const struct super_operations zonefs_sops = { - .alloc_inode = zonefs_alloc_inode, - .free_inode = zonefs_free_inode, - .statfs = zonefs_statfs, - .remount_fs = zonefs_remount, - .show_options = zonefs_show_options, -}; +static int zonefs_inode_setattr(struct mnt_idmap *idmap, + struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = d_inode(dentry); + int ret; -static const struct inode_operations zonefs_dir_inode_operations = { - .lookup = simple_lookup, + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + + ret = setattr_prepare(&nop_mnt_idmap, dentry, iattr); + if (ret) + return ret; + + /* + * Since files and directories cannot be created nor deleted, do not + * allow setting any write attributes on the sub-directories grouping + * files by zone type. 
+ */ + if ((iattr->ia_valid & ATTR_MODE) && S_ISDIR(inode->i_mode) && + (iattr->ia_mode & 0222)) + return -EPERM; + + if (((iattr->ia_valid & ATTR_UID) && + !uid_eq(iattr->ia_uid, inode->i_uid)) || + ((iattr->ia_valid & ATTR_GID) && + !gid_eq(iattr->ia_gid, inode->i_gid))) { + ret = dquot_transfer(&nop_mnt_idmap, inode, iattr); + if (ret) + return ret; + } + + if (iattr->ia_valid & ATTR_SIZE) { + ret = zonefs_file_truncate(inode, iattr->ia_size); + if (ret) + return ret; + } + + setattr_copy(&nop_mnt_idmap, inode, iattr); + + if (S_ISREG(inode->i_mode)) { + struct zonefs_zone *z = zonefs_inode_zone(inode); + + z->z_mode = inode->i_mode; + z->z_uid = inode->i_uid; + z->z_gid = inode->i_gid; + } + + return 0; +} + +static const struct inode_operations zonefs_file_inode_operations = { .setattr = zonefs_inode_setattr, }; -static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode, - enum zonefs_ztype type) +static long zonefs_fname_to_fno(const struct qstr *fname) { - struct super_block *sb = parent->i_sb; + const char *name = fname->name; + unsigned int len = fname->len; + long fno = 0, shift = 1; + const char *rname; + char c = *name; + unsigned int i; - inode->i_ino = bdev_nr_zones(sb->s_bdev) + type + 1; - inode_init_owner(&init_user_ns, inode, parent, S_IFDIR | 0555); - inode->i_op = &zonefs_dir_inode_operations; - inode->i_fop = &simple_dir_operations; - set_nlink(inode, 2); - inc_nlink(parent); + /* + * File names are always a base-10 number string without any + * leading 0s. + */ + if (!isdigit(c)) + return -ENOENT; + + if (len > 1 && c == '0') + return -ENOENT; + + if (len == 1) + return c - '0'; + + for (i = 0, rname = name + len - 1; i < len; i++, rname--) { + c = *rname; + if (!isdigit(c)) + return -ENOENT; + fno += (c - '0') * shift; + shift *= 10; + } + + return fno; } -static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, - enum zonefs_ztype type) +static struct inode *zonefs_get_file_inode(struct inode *dir, + struct dentry *dentry) { - struct super_block *sb = inode->i_sb; + struct zonefs_zone_group *zgroup = dir->i_private; + struct super_block *sb = dir->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - int ret = 0; + struct zonefs_zone *z; + struct inode *inode; + ino_t ino; + long fno; - inode->i_ino = zone->start >> sbi->s_zone_sectors_shift; - inode->i_mode = S_IFREG | sbi->s_perm; + /* Get the file number from the file name */ + fno = zonefs_fname_to_fno(&dentry->d_name); + if (fno < 0) + return ERR_PTR(fno); - zi->i_ztype = type; - zi->i_zsector = zone->start; - zi->i_zone_size = zone->len << SECTOR_SHIFT; - if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT && - !(sbi->s_features & ZONEFS_F_AGGRCNV)) { - zonefs_err(sb, - "zone size %llu doesn't match device's zone sectors %llu\n", - zi->i_zone_size, - bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT); - return -EINVAL; - } + if (!zgroup->g_nr_zones || fno >= zgroup->g_nr_zones) + return ERR_PTR(-ENOENT); - zi->i_max_size = min_t(loff_t, MAX_LFS_FILESIZE, - zone->capacity << SECTOR_SHIFT); - zi->i_wpoffset = zonefs_check_zone_condition(inode, zone, true, true); + z = &zgroup->g_zones[fno]; + ino = z->z_sector >> sbi->s_zone_sectors_shift; + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) { + WARN_ON_ONCE(inode->i_private != z); + return inode; + } - inode->i_uid = sbi->s_uid; - inode->i_gid = sbi->s_gid; - inode->i_size = zi->i_wpoffset; - inode->i_blocks = 
zi->i_max_size >> SECTOR_SHIFT; + inode->i_ino = ino; + inode->i_mode = z->z_mode; + inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime; + inode->i_uid = z->z_uid; + inode->i_gid = z->z_gid; + inode->i_size = z->z_wpoffset; + inode->i_blocks = z->z_capacity >> SECTOR_SHIFT; + inode->i_private = z; inode->i_op = &zonefs_file_inode_operations; inode->i_fop = &zonefs_file_operations; inode->i_mapping->a_ops = &zonefs_file_aops; - sb->s_maxbytes = max(zi->i_max_size, sb->s_maxbytes); - sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits; - sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits; + /* Update the inode access rights depending on the zone condition */ + zonefs_inode_update_mode(inode); + + unlock_new_inode(inode); - mutex_lock(&zi->i_truncate_mutex); + return inode; +} + +static struct inode *zonefs_get_zgroup_inode(struct super_block *sb, + enum zonefs_ztype ztype) +{ + struct inode *root = d_inode(sb->s_root); + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + struct inode *inode; + ino_t ino = bdev_nr_zones(sb->s_bdev) + ztype + 1; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + inode->i_ino = ino; + inode_init_owner(&nop_mnt_idmap, inode, root, S_IFDIR | 0555); + inode->i_size = sbi->s_zgroup[ztype].g_nr_zones; + inode->i_ctime = inode->i_mtime = inode->i_atime = root->i_ctime; + inode->i_private = &sbi->s_zgroup[ztype]; + set_nlink(inode, 2); + + inode->i_op = &zonefs_dir_inode_operations; + inode->i_fop = &zonefs_dir_operations; + + unlock_new_inode(inode); + + return inode; +} + + +static struct inode *zonefs_get_dir_inode(struct inode *dir, + struct dentry *dentry) +{ + struct super_block *sb = dir->i_sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + const char *name = dentry->d_name.name; + enum zonefs_ztype ztype; /* - * For sequential zones, make sure that any open zone is closed first - * to ensure that the initial number of open zones is 0, in sync with - * the open zone accounting done when the mount option - * ZONEFS_MNTOPT_EXPLICIT_OPEN is used. + * We only need to check for the "seq" directory and + * the "cnv" directory if we have conventional zones. 
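File lookup above goes through zonefs_fname_to_fno(), which accepts only a plain base-10 number with no leading zeros, so every file has exactly one valid name mapping to its index in the zone group. A stand-alone sketch of the same rule, accumulating left to right instead of scanning from the last digit as the kernel code does (-1 stands in for -ENOENT):

    #include <ctype.h>
    #include <stdio.h>
    #include <string.h>

    static long fname_to_fno(const char *name)
    {
            size_t len = strlen(name);
            long fno = 0;
            size_t i;

            /* Reject empty names and leading zeros: "0" is valid, "007" is not. */
            if (!len || (len > 1 && name[0] == '0'))
                    return -1;

            for (i = 0; i < len; i++) {
                    if (!isdigit((unsigned char)name[i]))
                            return -1;
                    fno = fno * 10 + (name[i] - '0');
            }
            return fno;
    }

    int main(void)
    {
            /* Prints: 42 -1 0 */
            printf("%ld %ld %ld\n", fname_to_fno("42"),
                   fname_to_fno("007"), fname_to_fno("0"));
            return 0;
    }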
*/ - if (type == ZONEFS_ZTYPE_SEQ && - (zone->cond == BLK_ZONE_COND_IMP_OPEN || - zone->cond == BLK_ZONE_COND_EXP_OPEN)) { - ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); - if (ret) - goto unlock; + if (dentry->d_name.len != 3) + return ERR_PTR(-ENOENT); + + for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { + if (sbi->s_zgroup[ztype].g_nr_zones && + memcmp(name, zonefs_zgroup_name(ztype), 3) == 0) + break; } + if (ztype == ZONEFS_ZTYPE_MAX) + return ERR_PTR(-ENOENT); + + return zonefs_get_zgroup_inode(sb, ztype); +} - zonefs_account_active(inode); +static struct dentry *zonefs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct inode *inode; -unlock: - mutex_unlock(&zi->i_truncate_mutex); + if (dentry->d_name.len > ZONEFS_NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); - return ret; + if (dir == d_inode(dir->i_sb->s_root)) + inode = zonefs_get_dir_inode(dir, dentry); + else + inode = zonefs_get_file_inode(dir, dentry); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return d_splice_alias(inode, dentry); } -static struct dentry *zonefs_create_inode(struct dentry *parent, - const char *name, struct blk_zone *zone, - enum zonefs_ztype type) +static int zonefs_readdir_root(struct file *file, struct dir_context *ctx) { - struct inode *dir = d_inode(parent); - struct dentry *dentry; - struct inode *inode; - int ret = -ENOMEM; + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + enum zonefs_ztype ztype = ZONEFS_ZTYPE_CNV; + ino_t base_ino = bdev_nr_zones(sb->s_bdev) + 1; - dentry = d_alloc_name(parent, name); - if (!dentry) - return ERR_PTR(ret); + if (ctx->pos >= inode->i_size) + return 0; - inode = new_inode(parent->d_sb); - if (!inode) - goto dput; + if (!dir_emit_dots(file, ctx)) + return 0; - inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime; - if (zone) { - ret = zonefs_init_file_inode(inode, zone, type); - if (ret) { - iput(inode); - goto dput; - } - } else { - zonefs_init_dir_inode(dir, inode, type); + if (ctx->pos == 2) { + if (!sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones) + ztype = ZONEFS_ZTYPE_SEQ; + + if (!dir_emit(ctx, zonefs_zgroup_name(ztype), 3, + base_ino + ztype, DT_DIR)) + return 0; + ctx->pos++; } - d_add(dentry, inode); - dir->i_size++; + if (ctx->pos == 3 && ztype != ZONEFS_ZTYPE_SEQ) { + ztype = ZONEFS_ZTYPE_SEQ; + if (!dir_emit(ctx, zonefs_zgroup_name(ztype), 3, + base_ino + ztype, DT_DIR)) + return 0; + ctx->pos++; + } - return dentry; + return 0; +}
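zonefs_readdir_root() above and zonefs_readdir_zgroup() below follow the usual dir_context convention: dir_emit_dots() consumes positions 0 and 1 for "." and "..", the first real entry sits at position 2, and ctx->pos is advanced only after an entry has been emitted, so a getdents() call that fills its buffer resumes exactly where it stopped. A condensed kernel-style skeleton of that pattern (not buildable outside a kernel tree; nr_entries, entry_name() and entry_ino() are hypothetical helpers):

    static int example_readdir(struct file *file, struct dir_context *ctx)
    {
            /* Emits "." and ".." as needed and advances ctx->pos to 2. */
            if (!dir_emit_dots(file, ctx))
                    return 0;

            while (ctx->pos < nr_entries + 2) {
                    unsigned int i = ctx->pos - 2;

                    /* Buffer full: return now, resume at ctx->pos next call. */
                    if (!dir_emit(ctx, entry_name(i), strlen(entry_name(i)),
                                  entry_ino(i), DT_REG))
                            return 0;
                    ctx->pos++;
            }
            return 0;
    }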
-dput: - dput(dentry); +static int zonefs_readdir_zgroup(struct file *file, + struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + struct zonefs_zone_group *zgroup = inode->i_private; + struct super_block *sb = inode->i_sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + struct zonefs_zone *z; + int fname_len; + char *fname; + ino_t ino; + int f; - return ERR_PTR(ret); + /* + * The size of zone group directories is equal to the number + * of zone files in the group and does not include the "." and + * ".." entries. Hence the "+ 2" here. + */ + if (ctx->pos >= inode->i_size + 2) + return 0; + + if (!dir_emit_dots(file, ctx)) + return 0; + + fname = kmalloc(ZONEFS_NAME_MAX, GFP_KERNEL); + if (!fname) + return -ENOMEM; + + for (f = ctx->pos - 2; f < zgroup->g_nr_zones; f++) { + z = &zgroup->g_zones[f]; + ino = z->z_sector >> sbi->s_zone_sectors_shift; + fname_len = snprintf(fname, ZONEFS_NAME_MAX - 1, "%u", f); + if (!dir_emit(ctx, fname, fname_len, ino, DT_REG)) + break; + ctx->pos++; + } + + kfree(fname); + + return 0; } +static int zonefs_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + + if (inode == d_inode(inode->i_sb->s_root)) + return zonefs_readdir_root(file, ctx); + + return zonefs_readdir_zgroup(file, ctx); +} + +const struct inode_operations zonefs_dir_inode_operations = { + .lookup = zonefs_lookup, + .setattr = zonefs_inode_setattr, +}; + +const struct file_operations zonefs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate_shared = zonefs_readdir, +}; + struct zonefs_zone_data { struct super_block *sb; unsigned int nr_zones[ZONEFS_ZTYPE_MAX]; + sector_t cnv_zone_start; struct blk_zone *zones; }; +static int zonefs_get_zone_info_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + struct zonefs_zone_data *zd = data; + struct super_block *sb = zd->sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + + /* + * We do not care about the first zone: it contains the super block + * and is not exposed as a file. + */ + if (!idx) + return 0; + + /* + * Count the number of zones that will be exposed as files. + * For sequential zones, we always have as many files as zones. + * For conventional zones, the number of files depends on whether + * conventional zone aggregation is enabled. + */ + switch (zone->type) { + case BLK_ZONE_TYPE_CONVENTIONAL: + if (sbi->s_features & ZONEFS_F_AGGRCNV) { + /* One file per set of contiguous conventional zones */ + if (!(sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones) || + zone->start != zd->cnv_zone_start) + sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones++; + zd->cnv_zone_start = zone->start + zone->len; + } else { + /* One file per zone */ + sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones++; + } + break; + case BLK_ZONE_TYPE_SEQWRITE_REQ: + case BLK_ZONE_TYPE_SEQWRITE_PREF: + sbi->s_zgroup[ZONEFS_ZTYPE_SEQ].g_nr_zones++; + break; + default: + zonefs_err(zd->sb, "Unsupported zone type 0x%x\n", + zone->type); + return -EIO; + } + + memcpy(&zd->zones[idx], zone, sizeof(struct blk_zone)); + + return 0; +} + +static int zonefs_get_zone_info(struct zonefs_zone_data *zd) +{ + struct block_device *bdev = zd->sb->s_bdev; + int ret; + + zd->zones = kvcalloc(bdev_nr_zones(bdev), sizeof(struct blk_zone), + GFP_KERNEL); + if (!zd->zones) + return -ENOMEM; + + /* Get zones information from the device */ + ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES, + zonefs_get_zone_info_cb, zd); + if (ret < 0) { + zonefs_err(zd->sb, "Zone report failed %d\n", ret); + return ret; + } + + if (ret != bdev_nr_zones(bdev)) { + zonefs_err(zd->sb, "Invalid zone report (%d/%u zones)\n", + ret, bdev_nr_zones(bdev)); + return -EIO; + } + + return 0; +} + +static inline void zonefs_free_zone_info(struct zonefs_zone_data *zd) +{ + kvfree(zd->zones); +}
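When ZONEFS_F_AGGRCNV is set, zonefs_get_zone_info_cb() above counts one file per run of contiguous conventional zones by remembering in zd->cnv_zone_start where the previous run ended. The same counting logic in a stand-alone sketch (the zone layout is made up; on a real device the runs are broken up by interleaved sequential zones rather than by holes):

    #include <stdio.h>

    struct zone {
            unsigned long long start;
            unsigned long long len;
    };

    /* Count one file per run of contiguous conventional zones. */
    static unsigned int count_cnv_files(const struct zone *z, unsigned int nr)
    {
            unsigned long long run_end = 0;
            unsigned int files = 0;
            unsigned int i;

            for (i = 0; i < nr; i++) {
                    /* A new file starts when a zone does not extend the run. */
                    if (!files || z[i].start != run_end)
                            files++;
                    run_end = z[i].start + z[i].len;
            }
            return files;
    }

    int main(void)
    {
            const struct zone z[] = {
                    { 0, 256 }, { 256, 256 }, { 1024, 256 },
            };

            /* Prints 2: the first two zones merge, the third starts a new file. */
            printf("%u\n", count_cnv_files(z, 3));
            return 0;
    }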
+ /* * Create a zone group and populate it with zone files. */ -static int zonefs_create_zgroup(struct zonefs_zone_data *zd, - enum zonefs_ztype type) +static int zonefs_init_zgroup(struct super_block *sb, + struct zonefs_zone_data *zd, + enum zonefs_ztype ztype) { - struct super_block *sb = zd->sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + struct zonefs_zone_group *zgroup = &sbi->s_zgroup[ztype]; struct blk_zone *zone, *next, *end; - const char *zgroup_name; - char *file_name; - struct dentry *dir, *dent; + struct zonefs_zone *z; unsigned int n = 0; int ret; - /* If the group is empty, there is nothing to do */ - if (!zd->nr_zones[type]) + /* Allocate the zone group. If it is empty, we have nothing to do. */ + if (!zgroup->g_nr_zones) return 0; - file_name = kmalloc(ZONEFS_NAME_MAX, GFP_KERNEL); - if (!file_name) + zgroup->g_zones = kvcalloc(zgroup->g_nr_zones, + sizeof(struct zonefs_zone), GFP_KERNEL); + if (!zgroup->g_zones) return -ENOMEM; - if (type == ZONEFS_ZTYPE_CNV) - zgroup_name = "cnv"; - else - zgroup_name = "seq"; - - dir = zonefs_create_inode(sb->s_root, zgroup_name, NULL, type); - if (IS_ERR(dir)) { - ret = PTR_ERR(dir); - goto free; - } - /* - * The first zone contains the super block: skip it. + * Initialize the zone groups using the device zone information. + * We always skip the first zone as it contains the super block + * and is not used to back a file. */ end = zd->zones + bdev_nr_zones(sb->s_bdev); for (zone = &zd->zones[1]; zone < end; zone = next) { next = zone + 1; - if (zonefs_zone_type(zone) != type) + if (zonefs_zone_type(zone) != ztype) continue; + if (WARN_ON_ONCE(n >= zgroup->g_nr_zones)) + return -EINVAL; + /* * For conventional zones, contiguous zones can be aggregated * together to form larger files. Note that this overwrites the @@ -1595,10 +982,10 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd, * found, assume that all zones aggregated have the same * condition. */ - if (type == ZONEFS_ZTYPE_CNV && + if (ztype == ZONEFS_ZTYPE_CNV && (sbi->s_features & ZONEFS_F_AGGRCNV)) { for (; next < end; next++) { - if (zonefs_zone_type(next) != type) + if (zonefs_zone_type(next) != ztype) break; zone->len += next->len; zone->capacity += next->capacity; @@ -1608,99 +995,118 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd, else if (next->cond == BLK_ZONE_COND_OFFLINE) zone->cond = BLK_ZONE_COND_OFFLINE; } - if (zone->capacity != zone->len) { - zonefs_err(sb, "Invalid conventional zone capacity\n"); - ret = -EINVAL; - goto free; - } } + z = &zgroup->g_zones[n]; + if (ztype == ZONEFS_ZTYPE_CNV) + z->z_flags |= ZONEFS_ZONE_CNV; + z->z_sector = zone->start; + z->z_size = zone->len << SECTOR_SHIFT; + if (z->z_size > bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT && + !(sbi->s_features & ZONEFS_F_AGGRCNV)) { + zonefs_err(sb, + "Invalid zone size %llu (device zone sectors %llu)\n", + z->z_size, + bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT); + return -EINVAL; + } + + z->z_capacity = min_t(loff_t, MAX_LFS_FILESIZE, + zone->capacity << SECTOR_SHIFT); + z->z_wpoffset = zonefs_check_zone_condition(sb, z, zone); + + z->z_mode = S_IFREG | sbi->s_perm; + z->z_uid = sbi->s_uid; + z->z_gid = sbi->s_gid; + + /* - * Use the file number within its group as file name. + * Let zonefs_inode_update_mode() know that we will need + * special initialization of the inode mode the first time + * it is accessed.
*/ - snprintf(file_name, ZONEFS_NAME_MAX - 1, "%u", n); - dent = zonefs_create_inode(dir, file_name, zone, type); - if (IS_ERR(dent)) { - ret = PTR_ERR(dent); - goto free; + z->z_flags |= ZONEFS_ZONE_INIT_MODE; + + sb->s_maxbytes = max(z->z_capacity, sb->s_maxbytes); + sbi->s_blocks += z->z_capacity >> sb->s_blocksize_bits; + sbi->s_used_blocks += z->z_wpoffset >> sb->s_blocksize_bits; + + /* + * For sequential zones, make sure that any open zone is closed + * first to ensure that the initial number of open zones is 0, + * in sync with the open zone accounting done when the mount + * option ZONEFS_MNTOPT_EXPLICIT_OPEN is used. + */ + if (ztype == ZONEFS_ZTYPE_SEQ && + (zone->cond == BLK_ZONE_COND_IMP_OPEN || + zone->cond == BLK_ZONE_COND_EXP_OPEN)) { + ret = zonefs_zone_mgmt(sb, z, REQ_OP_ZONE_CLOSE); + if (ret) + return ret; } + zonefs_account_active(sb, z); + n++; } - zonefs_info(sb, "Zone group \"%s\" has %u file%s\n", - zgroup_name, n, n > 1 ? "s" : ""); - - sbi->s_nr_files[type] = n; - ret = 0; + if (WARN_ON_ONCE(n != zgroup->g_nr_zones)) + return -EINVAL; -free: - kfree(file_name); + zonefs_info(sb, "Zone group \"%s\" has %u file%s\n", + zonefs_zgroup_name(ztype), + zgroup->g_nr_zones, + zgroup->g_nr_zones > 1 ? "s" : ""); - return ret; + return 0; } -static int zonefs_get_zone_info_cb(struct blk_zone *zone, unsigned int idx, - void *data) +static void zonefs_free_zgroups(struct super_block *sb) { - struct zonefs_zone_data *zd = data; - - /* - * Count the number of usable zones: the first zone at index 0 contains - * the super block and is ignored. - */ - switch (zone->type) { - case BLK_ZONE_TYPE_CONVENTIONAL: - zone->wp = zone->start + zone->len; - if (idx) - zd->nr_zones[ZONEFS_ZTYPE_CNV]++; - break; - case BLK_ZONE_TYPE_SEQWRITE_REQ: - case BLK_ZONE_TYPE_SEQWRITE_PREF: - if (idx) - zd->nr_zones[ZONEFS_ZTYPE_SEQ]++; - break; - default: - zonefs_err(zd->sb, "Unsupported zone type 0x%x\n", - zone->type); - return -EIO; - } + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + enum zonefs_ztype ztype; - memcpy(&zd->zones[idx], zone, sizeof(struct blk_zone)); + if (!sbi) + return; - return 0; + for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { + kvfree(sbi->s_zgroup[ztype].g_zones); + sbi->s_zgroup[ztype].g_zones = NULL; + } } -static int zonefs_get_zone_info(struct zonefs_zone_data *zd) +/* + * Create a zone group and populate it with zone files. 
+ */ +static int zonefs_init_zgroups(struct super_block *sb) { - struct block_device *bdev = zd->sb->s_bdev; + struct zonefs_zone_data zd; + enum zonefs_ztype ztype; int ret; - zd->zones = kvcalloc(bdev_nr_zones(bdev), sizeof(struct blk_zone), - GFP_KERNEL); - if (!zd->zones) - return -ENOMEM; - - /* Get zones information from the device */ - ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES, - zonefs_get_zone_info_cb, zd); - if (ret < 0) { - zonefs_err(zd->sb, "Zone report failed %d\n", ret); - return ret; - } + /* First get the device zone information */ + memset(&zd, 0, sizeof(struct zonefs_zone_data)); + zd.sb = sb; + ret = zonefs_get_zone_info(&zd); + if (ret) + goto cleanup; - if (ret != bdev_nr_zones(bdev)) { - zonefs_err(zd->sb, "Invalid zone report (%d/%u zones)\n", - ret, bdev_nr_zones(bdev)); - return -EIO; + /* Allocate and initialize the zone groups */ + for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { + ret = zonefs_init_zgroup(sb, &zd, ztype); + if (ret) { + zonefs_info(sb, + "Zone group \"%s\" initialization failed\n", + zonefs_zgroup_name(ztype)); + break; + } } - return 0; -} +cleanup: + zonefs_free_zone_info(&zd); + if (ret) + zonefs_free_zgroups(sb); -static inline void zonefs_cleanup_zone_info(struct zonefs_zone_data *zd) -{ - kvfree(zd->zones); + return ret; } /* @@ -1785,6 +1191,50 @@ free_page: return ret; } +static const struct super_operations zonefs_sops = { + .alloc_inode = zonefs_alloc_inode, + .free_inode = zonefs_free_inode, + .statfs = zonefs_statfs, + .remount_fs = zonefs_remount, + .show_options = zonefs_show_options, +}; + +static int zonefs_get_zgroup_inodes(struct super_block *sb) +{ + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + struct inode *dir_inode; + enum zonefs_ztype ztype; + + for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { + if (!sbi->s_zgroup[ztype].g_nr_zones) + continue; + + dir_inode = zonefs_get_zgroup_inode(sb, ztype); + if (IS_ERR(dir_inode)) + return PTR_ERR(dir_inode); + + sbi->s_zgroup[ztype].g_inode = dir_inode; + } + + return 0; +} + +static void zonefs_release_zgroup_inodes(struct super_block *sb) +{ + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + enum zonefs_ztype ztype; + + if (!sbi) + return; + + for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { + if (sbi->s_zgroup[ztype].g_inode) { + iput(sbi->s_zgroup[ztype].g_inode); + sbi->s_zgroup[ztype].g_inode = NULL; + } + } +} + /* * Check that the device is zoned. 
If it is, get the list of zones and create * sub-directories and files according to the device zone configuration and @@ -1792,10 +1242,9 @@ free_page: */ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) { - struct zonefs_zone_data zd; struct zonefs_sb_info *sbi; struct inode *inode; - enum zonefs_ztype t; + enum zonefs_ztype ztype; int ret; if (!bdev_is_zoned(sb->s_bdev)) { @@ -1845,16 +1294,6 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) if (ret) return ret; - memset(&zd, 0, sizeof(struct zonefs_zone_data)); - zd.sb = sb; - ret = zonefs_get_zone_info(&zd); - if (ret) - goto cleanup; - - ret = zonefs_sysfs_register(sb); - if (ret) - goto cleanup; - zonefs_info(sb, "Mounting %u zones", bdev_nr_zones(sb->s_bdev)); if (!sbi->s_max_wro_seq_files && @@ -1865,7 +1304,12 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN; } - /* Create root directory inode */ + /* Initialize the zone groups */ + ret = zonefs_init_zgroups(sb); + if (ret) + goto cleanup; + + /* Create the root directory inode */ ret = -ENOMEM; inode = new_inode(sb); if (!inode) @@ -1875,22 +1319,37 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) inode->i_mode = S_IFDIR | 0555; inode->i_ctime = inode->i_mtime = inode->i_atime = current_time(inode); inode->i_op = &zonefs_dir_inode_operations; - inode->i_fop = &simple_dir_operations; + inode->i_fop = &zonefs_dir_operations; + inode->i_size = 2; set_nlink(inode, 2); + for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { + if (sbi->s_zgroup[ztype].g_nr_zones) { + inc_nlink(inode); + inode->i_size++; + } + } sb->s_root = d_make_root(inode); if (!sb->s_root) goto cleanup; - /* Create and populate files in zone groups directories */ - for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) { - ret = zonefs_create_zgroup(&zd, t); - if (ret) - break; - } + /* + * Take a reference on the zone groups directory inodes + * to keep them in the inode cache. 
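The pinning mentioned here simply keeps the reference that iget_locked() returned until zonefs_release_zgroup_inodes() drops it with iput() at unmount time. A kernel-context sketch of the idiom (example_pin() is a hypothetical name and the body is a simplification, not the exact zonefs_get_zgroup_inode() code):

    static struct inode *example_pin(struct super_block *sb, ino_t ino)
    {
            struct inode *inode = iget_locked(sb, ino);

            if (!inode)
                    return ERR_PTR(-ENOMEM);
            if (inode->i_state & I_NEW) {
                    /* ... fill in i_mode, i_size, i_op, ... */
                    unlock_new_inode(inode);
            }
            /* The returned reference is the pin; iput() releases it. */
            return inode;
    }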
+ */ + ret = zonefs_get_zgroup_inodes(sb); + if (ret) + goto cleanup; + + ret = zonefs_sysfs_register(sb); + if (ret) + goto cleanup; + + return 0; cleanup: - zonefs_cleanup_zone_info(&zd); + zonefs_release_zgroup_inodes(sb); + zonefs_free_zgroups(sb); return ret; } @@ -1905,11 +1364,13 @@ static void zonefs_kill_super(struct super_block *sb) { struct zonefs_sb_info *sbi = ZONEFS_SB(sb); - if (sb->s_root) - d_genocide(sb->s_root); + /* Release the reference on the zone group directory inodes */ + zonefs_release_zgroup_inodes(sb); - zonefs_sysfs_unregister(sb); kill_block_super(sb); + + zonefs_sysfs_unregister(sb); + zonefs_free_zgroups(sb); kfree(sbi); } diff --git a/fs/zonefs/sysfs.c b/fs/zonefs/sysfs.c index 9920689dc098..8ccb65c2b419 100644 --- a/fs/zonefs/sysfs.c +++ b/fs/zonefs/sysfs.c @@ -79,7 +79,7 @@ static const struct sysfs_ops zonefs_sysfs_attr_ops = { .show = zonefs_sysfs_attr_show, }; -static struct kobj_type zonefs_sb_ktype = { +static const struct kobj_type zonefs_sb_ktype = { .default_groups = zonefs_sysfs_groups, .sysfs_ops = &zonefs_sysfs_attr_ops, .release = zonefs_sysfs_sb_release, diff --git a/fs/zonefs/trace.h b/fs/zonefs/trace.h index 42edcfd393ed..9969db3a9c7d 100644 --- a/fs/zonefs/trace.h +++ b/fs/zonefs/trace.h @@ -20,8 +20,9 @@ #define show_dev(dev) MAJOR(dev), MINOR(dev) TRACE_EVENT(zonefs_zone_mgmt, - TP_PROTO(struct inode *inode, enum req_op op), - TP_ARGS(inode, op), + TP_PROTO(struct super_block *sb, struct zonefs_zone *z, + enum req_op op), + TP_ARGS(sb, z, op), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) @@ -30,12 +31,12 @@ TRACE_EVENT(zonefs_zone_mgmt, __field(sector_t, nr_sectors) ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; + __entry->dev = sb->s_dev; + __entry->ino = + z->z_sector >> ZONEFS_SB(sb)->s_zone_sectors_shift; __entry->op = op; - __entry->sector = ZONEFS_I(inode)->i_zsector; - __entry->nr_sectors = - ZONEFS_I(inode)->i_zone_size >> SECTOR_SHIFT; + __entry->sector = z->z_sector; + __entry->nr_sectors = z->z_size >> SECTOR_SHIFT; ), TP_printk("bdev=(%d,%d), ino=%lu op=%s, sector=%llu, nr_sectors=%llu", show_dev(__entry->dev), (unsigned long)__entry->ino, @@ -58,9 +59,10 @@ TRACE_EVENT(zonefs_file_dio_append, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->sector = ZONEFS_I(inode)->i_zsector; + __entry->sector = zonefs_inode_zone(inode)->z_sector; __entry->size = size; - __entry->wpoffset = ZONEFS_I(inode)->i_wpoffset; + __entry->wpoffset = + zonefs_inode_zone(inode)->z_wpoffset; __entry->ret = ret; ), TP_printk("bdev=(%d, %d), ino=%lu, sector=%llu, size=%zu, wpoffset=%llu, ret=%zu", diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h index 1dbe78119ff1..8175652241b5 100644 --- a/fs/zonefs/zonefs.h +++ b/fs/zonefs/zonefs.h @@ -39,31 +39,53 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone) return ZONEFS_ZTYPE_SEQ; } -#define ZONEFS_ZONE_OPEN (1U << 0) -#define ZONEFS_ZONE_ACTIVE (1U << 1) -#define ZONEFS_ZONE_OFFLINE (1U << 2) -#define ZONEFS_ZONE_READONLY (1U << 3) +#define ZONEFS_ZONE_INIT_MODE (1U << 0) +#define ZONEFS_ZONE_OPEN (1U << 1) +#define ZONEFS_ZONE_ACTIVE (1U << 2) +#define ZONEFS_ZONE_OFFLINE (1U << 3) +#define ZONEFS_ZONE_READONLY (1U << 4) +#define ZONEFS_ZONE_CNV (1U << 31) /* - * In-memory inode data. + * In-memory per-file inode zone data. 
*/ -struct zonefs_inode_info { - struct inode i_vnode; +struct zonefs_zone { + /* Zone state flags */ + unsigned int z_flags; - /* File zone type */ - enum zonefs_ztype i_ztype; + /* Zone start sector (512B unit) */ + sector_t z_sector; - /* File zone start sector (512B unit) */ - sector_t i_zsector; + /* Zone size (bytes) */ + loff_t z_size; - /* File zone write pointer position (sequential zones only) */ - loff_t i_wpoffset; + /* Zone capacity (file maximum size, bytes) */ + loff_t z_capacity; - /* File maximum size */ - loff_t i_max_size; + /* Write pointer offset in the zone (sequential zones only, bytes) */ + loff_t z_wpoffset; + + /* Saved inode uid, gid and access rights */ + umode_t z_mode; + kuid_t z_uid; + kgid_t z_gid; +}; + +/* + * In memory zone group information: all zones of a group are exposed + * as files, one file per zone. + */ +struct zonefs_zone_group { + struct inode *g_inode; + unsigned int g_nr_zones; + struct zonefs_zone *g_zones; +}; - /* File zone size */ - loff_t i_zone_size; +/* + * In-memory inode data. + */ +struct zonefs_inode_info { + struct inode i_vnode; /* * To serialise fully against both syscall and mmap based IO and @@ -82,7 +104,6 @@ struct zonefs_inode_info { /* guarded by i_truncate_mutex */ unsigned int i_wr_refcnt; - unsigned int i_flags; }; static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode) @@ -90,6 +111,31 @@ static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode) return container_of(inode, struct zonefs_inode_info, i_vnode); } +static inline bool zonefs_zone_is_cnv(struct zonefs_zone *z) +{ + return z->z_flags & ZONEFS_ZONE_CNV; +} + +static inline bool zonefs_zone_is_seq(struct zonefs_zone *z) +{ + return !zonefs_zone_is_cnv(z); +} + +static inline struct zonefs_zone *zonefs_inode_zone(struct inode *inode) +{ + return inode->i_private; +} + +static inline bool zonefs_inode_is_cnv(struct inode *inode) +{ + return zonefs_zone_is_cnv(zonefs_inode_zone(inode)); +} + +static inline bool zonefs_inode_is_seq(struct inode *inode) +{ + return zonefs_zone_is_seq(zonefs_inode_zone(inode)); +} + /* * On-disk super block (block 0). */ @@ -181,7 +227,7 @@ struct zonefs_sb_info { uuid_t s_uuid; unsigned int s_zone_sectors_shift; - unsigned int s_nr_files[ZONEFS_ZTYPE_MAX]; + struct zonefs_zone_group s_zgroup[ZONEFS_ZTYPE_MAX]; loff_t s_blocks; loff_t s_used_blocks; @@ -209,6 +255,32 @@ static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb) #define zonefs_warn(sb, format, args...) 
\ pr_warn("zonefs (%s) WARNING: " format, sb->s_id, ## args) +/* In super.c */ +void zonefs_inode_account_active(struct inode *inode); +int zonefs_inode_zone_mgmt(struct inode *inode, enum req_op op); +void zonefs_i_size_write(struct inode *inode, loff_t isize); +void zonefs_update_stats(struct inode *inode, loff_t new_isize); +void __zonefs_io_error(struct inode *inode, bool write); + +static inline void zonefs_io_error(struct inode *inode, bool write) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + mutex_lock(&zi->i_truncate_mutex); + __zonefs_io_error(inode, write); + mutex_unlock(&zi->i_truncate_mutex); +} + +/* In super.c */ +extern const struct inode_operations zonefs_dir_inode_operations; +extern const struct file_operations zonefs_dir_operations; + +/* In file.c */ +extern const struct address_space_operations zonefs_file_aops; +extern const struct file_operations zonefs_file_operations; +int zonefs_file_truncate(struct inode *inode, loff_t isize); + +/* In sysfs.c */ int zonefs_sysfs_register(struct super_block *sb); void zonefs_sysfs_unregister(struct super_block *sb); int zonefs_sysfs_init(void); |
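One pattern worth calling out in the header above: zonefs_io_error() and __zonefs_io_error() form the usual locked/unlocked helper pair, where the double-underscore variant documents that the caller already holds i_truncate_mutex and the plain variant takes the lock itself. The same shape in a stand-alone user-space sketch (pthreads; the do_work()/__do_work() names are made up):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static int counter;

    /* __do_work(): caller must hold 'lock'. */
    static void __do_work(void)
    {
            counter++;
    }

    /* do_work(): locking wrapper around __do_work(). */
    static void do_work(void)
    {
            pthread_mutex_lock(&lock);
            __do_work();
            pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
            do_work();
            printf("%d\n", counter); /* prints 1 */
            return 0;
    }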