diff options
author | H. Peter Anvin <hpa@linux.intel.com> | 2013-04-20 09:16:44 -0700 |
---|---|---|
committer | H. Peter Anvin <hpa@linux.intel.com> | 2013-04-20 09:16:44 -0700 |
commit | f53f292eeaa234615c31a1306babe703fc4263f2 (patch) | |
tree | 707b0933a20f7dc05495e974243a23b5c9f8c918 /fs | |
parent | 15b9c359f288b09003cb70f7ed204affc0c6614d (diff) | |
parent | a9499fa7cd3fd4824a7202d00c766b269fa3bda6 (diff) |
Merge remote-tracking branch 'efi/chainsaw' into x86/efi
Resolved Conflicts:
drivers/firmware/efivars.c
fs/efivarsfs/file.c
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Diffstat (limited to 'fs')
585 files changed, 16959 insertions, 7899 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig index 0a93dc1cb4ac..55abfd62654a 100644 --- a/fs/9p/Kconfig +++ b/fs/9p/Kconfig @@ -11,8 +11,7 @@ config 9P_FS if 9P_FS config 9P_FSCACHE - bool "Enable 9P client caching support (EXPERIMENTAL)" - depends on EXPERIMENTAL + bool "Enable 9P client caching support" depends on 9P_FS=m && FSCACHE || 9P_FS=y && FSCACHE=y help Choose Y here to enable persistent, read-only local diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 15b679166201..7af425f53bee 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -23,6 +23,7 @@ #include "acl.h" #include "v9fs.h" #include "v9fs_vfs.h" +#include "fid.h" static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name) { @@ -113,16 +114,12 @@ struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type) } -static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl) +static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl) { int retval; char *name; size_t size; void *buffer; - struct inode *inode = dentry->d_inode; - - set_cached_acl(inode, type, acl); - if (!acl) return 0; @@ -144,17 +141,16 @@ static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl) default: BUG(); } - retval = v9fs_xattr_set(dentry, name, buffer, size, 0); + retval = v9fs_fid_xattr_set(fid, name, buffer, size, 0); err_free_out: kfree(buffer); return retval; } -int v9fs_acl_chmod(struct dentry *dentry) +int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid) { int retval = 0; struct posix_acl *acl; - struct inode *inode = dentry->d_inode; if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; @@ -163,25 +159,30 @@ int v9fs_acl_chmod(struct dentry *dentry) retval = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (retval) return retval; - retval = v9fs_set_acl(dentry, ACL_TYPE_ACCESS, acl); + set_cached_acl(inode, ACL_TYPE_ACCESS, acl); + retval = v9fs_set_acl(fid, ACL_TYPE_ACCESS, acl); posix_acl_release(acl); } return retval; } -int v9fs_set_create_acl(struct dentry *dentry, - struct posix_acl **dpacl, struct posix_acl **pacl) +int v9fs_set_create_acl(struct inode *inode, struct p9_fid *fid, + struct posix_acl *dacl, struct posix_acl *acl) { - if (dentry) { - v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, *dpacl); - v9fs_set_acl(dentry, ACL_TYPE_ACCESS, *pacl); - } - posix_acl_release(*dpacl); - posix_acl_release(*pacl); - *dpacl = *pacl = NULL; + set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl); + set_cached_acl(inode, ACL_TYPE_ACCESS, acl); + v9fs_set_acl(fid, ACL_TYPE_DEFAULT, dacl); + v9fs_set_acl(fid, ACL_TYPE_ACCESS, acl); return 0; } +void v9fs_put_acl(struct posix_acl *dacl, + struct posix_acl *acl) +{ + posix_acl_release(dacl); + posix_acl_release(acl); +} + int v9fs_acl_mode(struct inode *dir, umode_t *modep, struct posix_acl **dpacl, struct posix_acl **pacl) { diff --git a/fs/9p/acl.h b/fs/9p/acl.h index 559556411965..e4f7e882272b 100644 --- a/fs/9p/acl.h +++ b/fs/9p/acl.h @@ -17,27 +17,33 @@ #ifdef CONFIG_9P_FS_POSIX_ACL extern int v9fs_get_acl(struct inode *, struct p9_fid *); extern struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type); -extern int v9fs_acl_chmod(struct dentry *); -extern int v9fs_set_create_acl(struct dentry *, - struct posix_acl **, struct posix_acl **); +extern int v9fs_acl_chmod(struct inode *, struct p9_fid *); +extern int v9fs_set_create_acl(struct inode *, struct p9_fid *, + struct posix_acl *, struct posix_acl *); extern int v9fs_acl_mode(struct inode *dir, umode_t *modep, struct posix_acl **dpacl, struct posix_acl **pacl); +extern void v9fs_put_acl(struct posix_acl *dacl, struct posix_acl *acl); #else #define v9fs_iop_get_acl NULL static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) { return 0; } -static inline int v9fs_acl_chmod(struct dentry *dentry) +static inline int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid) { return 0; } -static inline int v9fs_set_create_acl(struct dentry *dentry, - struct posix_acl **dpacl, - struct posix_acl **pacl) +static inline int v9fs_set_create_acl(struct inode *inode, + struct p9_fid *fid, + struct posix_acl *dacl, + struct posix_acl *acl) { return 0; } +static inline void v9fs_put_acl(struct posix_acl *dacl, + struct posix_acl *acl) +{ +} static inline int v9fs_acl_mode(struct inode *dir, umode_t *modep, struct posix_acl **dpacl, struct posix_acl **pacl) diff --git a/fs/9p/fid.c b/fs/9p/fid.c index da8eefbe830d..d51ec9fafcc8 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -41,29 +41,16 @@ * */ -int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid) +static inline void __add_fid(struct dentry *dentry, struct p9_fid *fid) { - struct v9fs_dentry *dent; - - p9_debug(P9_DEBUG_VFS, "fid %d dentry %s\n", - fid->fid, dentry->d_name.name); - - dent = dentry->d_fsdata; - if (!dent) { - dent = kmalloc(sizeof(struct v9fs_dentry), GFP_KERNEL); - if (!dent) - return -ENOMEM; - - spin_lock_init(&dent->lock); - INIT_LIST_HEAD(&dent->fidlist); - dentry->d_fsdata = dent; - } - - spin_lock(&dent->lock); - list_add(&fid->dlist, &dent->fidlist); - spin_unlock(&dent->lock); + hlist_add_head(&fid->dlist, (struct hlist_head *)&dentry->d_fsdata); +} - return 0; +void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid) +{ + spin_lock(&dentry->d_lock); + __add_fid(dentry, fid); + spin_unlock(&dentry->d_lock); } /** @@ -74,24 +61,25 @@ int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid) * */ -static struct p9_fid *v9fs_fid_find(struct dentry *dentry, u32 uid, int any) +static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any) { - struct v9fs_dentry *dent; struct p9_fid *fid, *ret; p9_debug(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n", - dentry->d_name.name, dentry, uid, any); - dent = (struct v9fs_dentry *) dentry->d_fsdata; + dentry->d_name.name, dentry, from_kuid(&init_user_ns, uid), + any); ret = NULL; - if (dent) { - spin_lock(&dent->lock); - list_for_each_entry(fid, &dent->fidlist, dlist) { - if (any || fid->uid == uid) { + /* we'll recheck under lock if there's anything to look in */ + if (dentry->d_fsdata) { + struct hlist_head *h = (struct hlist_head *)&dentry->d_fsdata; + spin_lock(&dentry->d_lock); + hlist_for_each_entry(fid, h, dlist) { + if (any || uid_eq(fid->uid, uid)) { ret = fid; break; } } - spin_unlock(&dent->lock); + spin_unlock(&dentry->d_lock); } return ret; @@ -126,7 +114,7 @@ err_out: } static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, - uid_t uid, int any) + kuid_t uid, int any) { struct dentry *ds; char **wnames, *uname; @@ -214,8 +202,17 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, } kfree(wnames); fid_out: - if (!IS_ERR(fid)) - v9fs_fid_add(dentry, fid); + if (!IS_ERR(fid)) { + spin_lock(&dentry->d_lock); + if (d_unhashed(dentry)) { + spin_unlock(&dentry->d_lock); + p9_client_clunk(fid); + fid = ERR_PTR(-ENOENT); + } else { + __add_fid(dentry, fid); + spin_unlock(&dentry->d_lock); + } + } err_out: up_read(&v9ses->rename_sem); return fid; @@ -233,7 +230,7 @@ err_out: struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) { - uid_t uid; + kuid_t uid; int any, access; struct v9fs_session_info *v9ses; @@ -253,7 +250,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) break; default: - uid = ~0; + uid = INVALID_UID; any = 0; break; } @@ -272,7 +269,7 @@ struct p9_fid *v9fs_fid_clone(struct dentry *dentry) return ret; } -static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, uid_t uid) +static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, kuid_t uid) { struct p9_fid *fid, *ret; @@ -289,7 +286,7 @@ struct p9_fid *v9fs_writeback_fid(struct dentry *dentry) int err; struct p9_fid *fid; - fid = v9fs_fid_clone_with_uid(dentry, 0); + fid = v9fs_fid_clone_with_uid(dentry, GLOBAL_ROOT_UID); if (IS_ERR(fid)) goto error_out; /* diff --git a/fs/9p/fid.h b/fs/9p/fid.h index bb0b6e7f58fc..2b6787fcb626 100644 --- a/fs/9p/fid.h +++ b/fs/9p/fid.h @@ -23,28 +23,8 @@ #define FS_9P_FID_H #include <linux/list.h> -/** - * struct v9fs_dentry - 9p private data stored in dentry d_fsdata - * @lock: protects the fidlist - * @fidlist: list of FIDs currently associated with this dentry - * - * This structure defines the 9p private data associated with - * a particular dentry. In particular, this private data is used - * to lookup which 9P FID handle should be used for a particular VFS - * operation. FID handles are associated with dentries instead of - * inodes in order to more closely map functionality to the Plan 9 - * expected behavior for FID reclaimation and tracking. - * - * See Also: Mapping FIDs to Linux VFS model in - * Design and Implementation of the Linux 9P File System documentation - */ -struct v9fs_dentry { - spinlock_t lock; /* protect fidlist */ - struct list_head fidlist; -}; - struct p9_fid *v9fs_fid_lookup(struct dentry *dentry); struct p9_fid *v9fs_fid_clone(struct dentry *dentry); -int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid); +void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid); struct p9_fid *v9fs_writeback_fid(struct dentry *dentry); #endif diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index d934f04e7736..58e6cbce4156 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -161,7 +161,13 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) ret = r; continue; } - v9ses->dfltuid = option; + v9ses->dfltuid = make_kuid(current_user_ns(), option); + if (!uid_valid(v9ses->dfltuid)) { + p9_debug(P9_DEBUG_ERROR, + "uid field, but not a uid?\n"); + ret = -EINVAL; + continue; + } break; case Opt_dfltgid: r = match_int(&args[0], &option); @@ -171,7 +177,13 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) ret = r; continue; } - v9ses->dfltgid = option; + v9ses->dfltgid = make_kgid(current_user_ns(), option); + if (!gid_valid(v9ses->dfltgid)) { + p9_debug(P9_DEBUG_ERROR, + "gid field, but not a gid?\n"); + ret = -EINVAL; + continue; + } break; case Opt_afid: r = match_int(&args[0], &option); @@ -248,8 +260,9 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) else if (strcmp(s, "client") == 0) { v9ses->flags |= V9FS_ACCESS_CLIENT; } else { + uid_t uid; v9ses->flags |= V9FS_ACCESS_SINGLE; - v9ses->uid = simple_strtoul(s, &e, 10); + uid = simple_strtoul(s, &e, 10); if (*e != '\0') { ret = -EINVAL; pr_info("Unknown access argument %s\n", @@ -257,6 +270,13 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) kfree(s); goto free_and_return; } + v9ses->uid = make_kuid(current_user_ns(), uid); + if (!uid_valid(v9ses->uid)) { + ret = -EINVAL; + pr_info("Uknown uid %s\n", s); + kfree(s); + goto free_and_return; + } } kfree(s); @@ -319,7 +339,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, list_add(&v9ses->slist, &v9fs_sessionlist); spin_unlock(&v9fs_sessionlist_lock); - v9ses->uid = ~0; + v9ses->uid = INVALID_UID; v9ses->dfltuid = V9FS_DEFUID; v9ses->dfltgid = V9FS_DEFGID; @@ -364,7 +384,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, v9ses->flags &= ~V9FS_ACCESS_MASK; v9ses->flags |= V9FS_ACCESS_ANY; - v9ses->uid = ~0; + v9ses->uid = INVALID_UID; } if (!v9fs_proto_dotl(v9ses) || !((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) { @@ -375,7 +395,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, v9ses->flags &= ~V9FS_ACL_MASK; } - fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, ~0, + fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, INVALID_UID, v9ses->aname); if (IS_ERR(fid)) { retval = PTR_ERR(fid); @@ -387,7 +407,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_SINGLE) fid->uid = v9ses->uid; else - fid->uid = ~0; + fid->uid = INVALID_UID; #ifdef CONFIG_9P_FSCACHE /* register the session for caching */ diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 34c59f14a1c9..a8e127c89627 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -109,9 +109,9 @@ struct v9fs_session_info { char *uname; /* user name to mount as */ char *aname; /* name of remote hierarchy being mounted */ unsigned int maxdata; /* max data for client interface */ - unsigned int dfltuid; /* default uid/muid for legacy support */ - unsigned int dfltgid; /* default gid for legacy support */ - u32 uid; /* if ACCESS_SINGLE, the uid that has access */ + kuid_t dfltuid; /* default uid/muid for legacy support */ + kgid_t dfltgid; /* default gid for legacy support */ + kuid_t uid; /* if ACCESS_SINGLE, the uid that has access */ struct p9_client *clnt; /* 9p client */ struct list_head slist; /* list of sessions registered with v9fs */ struct backing_dev_info bdi; @@ -165,8 +165,8 @@ extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, #define V9FS_PORT 564 #define V9FS_DEFUSER "nobody" #define V9FS_DEFANAME "" -#define V9FS_DEFUID (-2) -#define V9FS_DEFGID (-2) +#define V9FS_DEFUID KUIDT_INIT(-2) +#define V9FS_DEFGID KGIDT_INIT(-2) static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode) { diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index 64600b5d0522..f039b104a98e 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -83,21 +83,12 @@ static int v9fs_cached_dentry_delete(const struct dentry *dentry) static void v9fs_dentry_release(struct dentry *dentry) { - struct v9fs_dentry *dent; - struct p9_fid *temp, *current_fid; - + struct hlist_node *p, *n; p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name, dentry); - dent = dentry->d_fsdata; - if (dent) { - list_for_each_entry_safe(current_fid, temp, &dent->fidlist, - dlist) { - p9_client_clunk(current_fid); - } - - kfree(dent); - dentry->d_fsdata = NULL; - } + hlist_for_each_safe(p, n, (struct hlist_head *)&dentry->d_fsdata) + p9_client_clunk(hlist_entry(p, struct p9_fid, dlist)); + dentry->d_fsdata = NULL; } static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags) @@ -137,6 +128,7 @@ out_valid: const struct dentry_operations v9fs_cached_dentry_operations = { .d_revalidate = v9fs_lookup_revalidate, + .d_weak_revalidate = v9fs_lookup_revalidate, .d_delete = v9fs_cached_dentry_delete, .d_release = v9fs_dentry_release, }; diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index ff911e779651..be1e34adc3c6 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -52,10 +52,9 @@ */ struct p9_rdir { - struct mutex mutex; int head; int tail; - uint8_t *buf; + uint8_t buf[]; }; /** @@ -93,33 +92,12 @@ static void p9stat_init(struct p9_wstat *stbuf) * */ -static int v9fs_alloc_rdir_buf(struct file *filp, int buflen) +static struct p9_rdir *v9fs_alloc_rdir_buf(struct file *filp, int buflen) { - struct p9_rdir *rdir; - struct p9_fid *fid; - int err = 0; - - fid = filp->private_data; - if (!fid->rdir) { - rdir = kmalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL); - - if (rdir == NULL) { - err = -ENOMEM; - goto exit; - } - spin_lock(&filp->f_dentry->d_lock); - if (!fid->rdir) { - rdir->buf = (uint8_t *)rdir + sizeof(struct p9_rdir); - mutex_init(&rdir->mutex); - rdir->head = rdir->tail = 0; - fid->rdir = (void *) rdir; - rdir = NULL; - } - spin_unlock(&filp->f_dentry->d_lock); - kfree(rdir); - } -exit: - return err; + struct p9_fid *fid = filp->private_data; + if (!fid->rdir) + fid->rdir = kzalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL); + return fid->rdir; } /** @@ -145,20 +123,16 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) buflen = fid->clnt->msize - P9_IOHDRSZ; - err = v9fs_alloc_rdir_buf(filp, buflen); - if (err) - goto exit; - rdir = (struct p9_rdir *) fid->rdir; + rdir = v9fs_alloc_rdir_buf(filp, buflen); + if (!rdir) + return -ENOMEM; - err = mutex_lock_interruptible(&rdir->mutex); - if (err) - return err; - while (err == 0) { + while (1) { if (rdir->tail == rdir->head) { err = v9fs_file_readn(filp, rdir->buf, NULL, buflen, filp->f_pos); if (err <= 0) - goto unlock_and_exit; + return err; rdir->head = 0; rdir->tail = err; @@ -169,9 +143,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) rdir->tail - rdir->head, &st); if (err) { p9_debug(P9_DEBUG_VFS, "returned %d\n", err); - err = -EIO; p9stat_free(&st); - goto unlock_and_exit; + return -EIO; } reclen = st.size+2; @@ -180,19 +153,13 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) p9stat_free(&st); - if (over) { - err = 0; - goto unlock_and_exit; - } + if (over) + return 0; + rdir->head += reclen; filp->f_pos += reclen; } } - -unlock_and_exit: - mutex_unlock(&rdir->mutex); -exit: - return err; } /** @@ -218,21 +185,16 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, buflen = fid->clnt->msize - P9_READDIRHDRSZ; - err = v9fs_alloc_rdir_buf(filp, buflen); - if (err) - goto exit; - rdir = (struct p9_rdir *) fid->rdir; + rdir = v9fs_alloc_rdir_buf(filp, buflen); + if (!rdir) + return -ENOMEM; - err = mutex_lock_interruptible(&rdir->mutex); - if (err) - return err; - - while (err == 0) { + while (1) { if (rdir->tail == rdir->head) { err = p9_client_readdir(fid, rdir->buf, buflen, filp->f_pos); if (err <= 0) - goto unlock_and_exit; + return err; rdir->head = 0; rdir->tail = err; @@ -245,8 +207,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, &curdirent); if (err < 0) { p9_debug(P9_DEBUG_VFS, "returned %d\n", err); - err = -EIO; - goto unlock_and_exit; + return -EIO; } /* d_off in dirent structure tracks the offset into @@ -261,20 +222,13 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, curdirent.d_type); oldoffset = curdirent.d_off; - if (over) { - err = 0; - goto unlock_and_exit; - } + if (over) + return 0; filp->f_pos = curdirent.d_off; rdir->head += err; } } - -unlock_and_exit: - mutex_unlock(&rdir->mutex); -exit: - return err; } diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index c2483e97beee..d384a8b77ee8 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -80,10 +80,6 @@ int v9fs_file_open(struct inode *inode, struct file *file) p9_client_clunk(fid); return err; } - if (file->f_flags & O_TRUNC) { - i_size_write(inode, 0); - inode->i_blocks = 0; - } if ((file->f_flags & O_APPEND) && (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses))) generic_file_llseek(file, 0, SEEK_END); @@ -133,7 +129,7 @@ out_error: static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) { int res = 0; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); @@ -302,7 +298,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); int ret = -ENOLCK; p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", @@ -338,7 +334,7 @@ out_err: static int v9fs_file_flock_dotl(struct file *filp, int cmd, struct file_lock *fl) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); int ret = -ENOLCK; p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", @@ -529,7 +525,7 @@ v9fs_file_write(struct file *filp, const char __user * data, if (!count) goto out; - retval = v9fs_file_write_internal(filp->f_path.dentry->d_inode, + retval = v9fs_file_write_internal(file_inode(filp), filp->private_data, data, count, &origin, 1); /* update offset on successful write */ @@ -604,7 +600,7 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) struct v9fs_inode *v9inode; struct page *page = vmf->page; struct file *filp = vma->vm_file; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n", @@ -620,6 +616,7 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) lock_page(page); if (page->mapping != inode->i_mapping) goto out_unlock; + wait_for_stable_page(page); return VM_FAULT_LOCKED; out_unlock: diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 890bed538f9b..d86edc8d3fd0 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -192,9 +192,6 @@ int v9fs_uflags2omode(int uflags, int extended) break; } - if (uflags & O_TRUNC) - ret |= P9_OTRUNC; - if (extended) { if (uflags & O_EXCL) ret |= P9_OEXCL; @@ -228,9 +225,9 @@ v9fs_blank_wstat(struct p9_wstat *wstat) wstat->uid = NULL; wstat->gid = NULL; wstat->muid = NULL; - wstat->n_uid = ~0; - wstat->n_gid = ~0; - wstat->n_muid = ~0; + wstat->n_uid = INVALID_UID; + wstat->n_gid = INVALID_GID; + wstat->n_muid = INVALID_UID; wstat->extension = NULL; } @@ -695,9 +692,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, "inode creation failed %d\n", err); goto error; } - err = v9fs_fid_add(dentry, fid); - if (err < 0) - goto error; + v9fs_fid_add(dentry, fid); d_instantiate(dentry, inode); } return ofid; @@ -793,7 +788,6 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, struct p9_fid *dfid, *fid; struct inode *inode; char *name; - int result = 0; p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p flags: %x\n", dir, dentry->d_name.name, dentry, flags); @@ -811,13 +805,11 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, name = (char *) dentry->d_name.name; fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { - result = PTR_ERR(fid); - if (result == -ENOENT) { - inode = NULL; - goto inst_out; + if (fid == ERR_PTR(-ENOENT)) { + d_add(dentry, NULL); + return NULL; } - - return ERR_PTR(result); + return ERR_CAST(fid); } /* * Make sure we don't use a wrong inode due to parallel @@ -829,14 +821,9 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, else inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { - result = PTR_ERR(inode); - inode = NULL; - goto error; + p9_client_clunk(fid); + return ERR_CAST(inode); } - result = v9fs_fid_add(dentry, fid); - if (result < 0) - goto error_iput; -inst_out: /* * If we had a rename on the server and a parallel lookup * for the new name, then make sure we instantiate with @@ -845,15 +832,13 @@ inst_out: * k/b. */ res = d_materialise_unique(dentry, inode); - if (!IS_ERR(res)) - return res; - result = PTR_ERR(res); -error_iput: - iput(inode); -error: - p9_client_clunk(fid); - - return ERR_PTR(result); + if (!res) + v9fs_fid_add(dentry, fid); + else if (!IS_ERR(res)) + v9fs_fid_add(res, fid); + else + p9_client_clunk(fid); + return res; } static int diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 40895546e103..53687bbf2296 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -57,7 +57,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, * group of the new file system object. */ -static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode) +static kgid_t v9fs_get_fsgid_for_create(struct inode *dir_inode) { BUG_ON(dir_inode == NULL); @@ -186,7 +186,6 @@ static int v9fs_mapped_dotl_flags(int flags) { O_CREAT, P9_DOTL_CREATE }, { O_EXCL, P9_DOTL_EXCL }, { O_NOCTTY, P9_DOTL_NOCTTY }, - { O_TRUNC, P9_DOTL_TRUNC }, { O_APPEND, P9_DOTL_APPEND }, { O_NONBLOCK, P9_DOTL_NONBLOCK }, { O_DSYNC, P9_DOTL_DSYNC }, @@ -246,7 +245,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, int *opened) { int err = 0; - gid_t gid; + kgid_t gid; umode_t mode; char *name = NULL; struct p9_qid qid; @@ -268,8 +267,14 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, } /* Only creates */ - if (!(flags & O_CREAT) || dentry->d_inode) - return finish_no_open(file, res); + if (!(flags & O_CREAT)) + return finish_no_open(file, res); + else if (dentry->d_inode) { + if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) + return -EEXIST; + else + return finish_no_open(file, res); + } v9ses = v9fs_inode2v9ses(dir); @@ -325,13 +330,11 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } - err = v9fs_fid_add(dentry, fid); - if (err < 0) - goto error; - d_instantiate(dentry, inode); - /* Now set the ACL based on the default value */ - v9fs_set_create_acl(dentry, &dacl, &pacl); + v9fs_set_create_acl(inode, fid, dacl, pacl); + + v9fs_fid_add(dentry, fid); + d_instantiate(dentry, inode); v9inode = V9FS_I(inode); mutex_lock(&v9inode->v_mutex); @@ -364,6 +367,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, #endif *opened |= FILE_CREATED; out: + v9fs_put_acl(dacl, pacl); dput(res); return err; @@ -373,7 +377,6 @@ error: err_clunk_old_fid: if (ofid) p9_client_clunk(ofid); - v9fs_set_create_acl(NULL, &dacl, &pacl); goto out; } @@ -391,7 +394,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, int err; struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL, *dfid = NULL; - gid_t gid; + kgid_t gid; char *name; umode_t mode; struct inode *inode; @@ -430,17 +433,17 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, if (err < 0) goto error; + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); + fid = NULL; + goto error; + } + /* instantiate inode and assign the unopened fid to the dentry */ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { - fid = p9_client_walk(dfid, 1, &name, 1); - if (IS_ERR(fid)) { - err = PTR_ERR(fid); - p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", - err); - fid = NULL; - goto error; - } - inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); @@ -448,11 +451,11 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, err); goto error; } - err = v9fs_fid_add(dentry, fid); - if (err < 0) - goto error; + v9fs_fid_add(dentry, fid); + v9fs_set_create_acl(inode, fid, dacl, pacl); d_instantiate(dentry, inode); fid = NULL; + err = 0; } else { /* * Not in cached mode. No need to populate @@ -464,16 +467,15 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, err = PTR_ERR(inode); goto error; } + v9fs_set_create_acl(inode, fid, dacl, pacl); d_instantiate(dentry, inode); } - /* Now set the ACL based on the default value */ - v9fs_set_create_acl(dentry, &dacl, &pacl); inc_nlink(dir); v9fs_invalidate_inode_attr(dir); error: if (fid) p9_client_clunk(fid); - v9fs_set_create_acl(NULL, &dacl, &pacl); + v9fs_put_acl(dacl, pacl); return err; } @@ -567,10 +569,11 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) struct v9fs_session_info *v9ses; struct p9_fid *fid; struct p9_iattr_dotl p9attr; + struct inode *inode = dentry->d_inode; p9_debug(P9_DEBUG_VFS, "\n"); - retval = inode_change_ok(dentry->d_inode, iattr); + retval = inode_change_ok(inode, iattr); if (retval) return retval; @@ -591,23 +594,23 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) return PTR_ERR(fid); /* Write all dirty data */ - if (S_ISREG(dentry->d_inode->i_mode)) - filemap_write_and_wait(dentry->d_inode->i_mapping); + if (S_ISREG(inode->i_mode)) + filemap_write_and_wait(inode->i_mapping); retval = p9_client_setattr(fid, &p9attr); if (retval < 0) return retval; if ((iattr->ia_valid & ATTR_SIZE) && - iattr->ia_size != i_size_read(dentry->d_inode)) - truncate_setsize(dentry->d_inode, iattr->ia_size); + iattr->ia_size != i_size_read(inode)) + truncate_setsize(inode, iattr->ia_size); - v9fs_invalidate_inode_attr(dentry->d_inode); - setattr_copy(dentry->d_inode, iattr); - mark_inode_dirty(dentry->d_inode); + v9fs_invalidate_inode_attr(inode); + setattr_copy(inode, iattr); + mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) { /* We also want to update ACL when we update mode bits */ - retval = v9fs_acl_chmod(dentry); + retval = v9fs_acl_chmod(inode, fid); if (retval < 0) return retval; } @@ -692,7 +695,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, const char *symname) { int err; - gid_t gid; + kgid_t gid; char *name; struct p9_qid qid; struct inode *inode; @@ -741,11 +744,10 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, err); goto error; } - err = v9fs_fid_add(dentry, fid); - if (err < 0) - goto error; + v9fs_fid_add(dentry, fid); d_instantiate(dentry, inode); fid = NULL; + err = 0; } else { /* Not in cached mode. No need to populate inode with stat */ inode = v9fs_get_inode(dir->i_sb, S_IFLNK, 0); @@ -832,7 +834,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, dev_t rdev) { int err; - gid_t gid; + kgid_t gid; char *name; umode_t mode; struct v9fs_session_info *v9ses; @@ -875,17 +877,17 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, goto error; v9fs_invalidate_inode_attr(dir); + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); + fid = NULL; + goto error; + } + /* instantiate inode and assign the unopened fid to the dentry */ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { - fid = p9_client_walk(dfid, 1, &name, 1); - if (IS_ERR(fid)) { - err = PTR_ERR(fid); - p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", - err); - fid = NULL; - goto error; - } - inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); @@ -893,11 +895,11 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, err); goto error; } - err = v9fs_fid_add(dentry, fid); - if (err < 0) - goto error; + v9fs_set_create_acl(inode, fid, dacl, pacl); + v9fs_fid_add(dentry, fid); d_instantiate(dentry, inode); fid = NULL; + err = 0; } else { /* * Not in cached mode. No need to populate inode with stat. @@ -908,14 +910,13 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, err = PTR_ERR(inode); goto error; } + v9fs_set_create_acl(inode, fid, dacl, pacl); d_instantiate(dentry, inode); } - /* Now set the ACL based on the default value */ - v9fs_set_create_acl(dentry, &dacl, &pacl); error: if (fid) p9_client_clunk(fid); - v9fs_set_create_acl(NULL, &dacl, &pacl); + v9fs_put_acl(dacl, pacl); return err; } diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 137d50396898..2756dcd5de6e 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -363,5 +363,6 @@ struct file_system_type v9fs_fs_type = { .mount = v9fs_mount, .kill_sb = v9fs_kill_super, .owner = THIS_MODULE, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT, + .fs_flags = FS_RENAME_DOES_D_MOVE, }; +MODULE_ALIAS_FS("9p"); diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 29653b70a9c3..c45e016b190f 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -111,19 +111,26 @@ ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name, int v9fs_xattr_set(struct dentry *dentry, const char *name, const void *value, size_t value_len, int flags) { + struct p9_fid *fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + return v9fs_fid_xattr_set(fid, name, value, value_len, flags); +} + +int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, + const void *value, size_t value_len, int flags) +{ u64 offset = 0; int retval, msize, write_count; - struct p9_fid *fid = NULL; p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n", name, value_len, flags); - fid = v9fs_fid_clone(dentry); - if (IS_ERR(fid)) { - retval = PTR_ERR(fid); - fid = NULL; - goto error; - } + /* Clone it */ + fid = p9_client_walk(fid, 0, NULL, 1); + if (IS_ERR(fid)) + return PTR_ERR(fid); + /* * On success fid points to xattr */ @@ -131,7 +138,8 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name, if (retval < 0) { p9_debug(P9_DEBUG_VFS, "p9_client_xattrcreate failed %d\n", retval); - goto error; + p9_client_clunk(fid); + return retval; } msize = fid->clnt->msize; while (value_len) { @@ -144,17 +152,12 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name, if (write_count < 0) { /* error in xattr write */ retval = write_count; - goto error; + break; } offset += write_count; value_len -= write_count; } - /* Total read xattr bytes */ - retval = offset; -error: - if (fid) - retval = p9_client_clunk(fid); - return retval; + return p9_client_clunk(fid); } ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h index eaa837c53bd5..eec348a3df71 100644 --- a/fs/9p/xattr.h +++ b/fs/9p/xattr.h @@ -27,6 +27,8 @@ extern ssize_t v9fs_fid_xattr_get(struct p9_fid *, const char *, void *, size_t); extern ssize_t v9fs_xattr_get(struct dentry *, const char *, void *, size_t); +extern int v9fs_fid_xattr_set(struct p9_fid *, const char *, + const void *, size_t, int); extern int v9fs_xattr_set(struct dentry *, const char *, const void *, size_t, int); extern ssize_t v9fs_listxattr(struct dentry *, char *, size_t); diff --git a/fs/Kconfig b/fs/Kconfig index 780725a463b1..c229f828eb01 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -211,6 +211,7 @@ source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" source "fs/exofs/Kconfig" source "fs/f2fs/Kconfig" +source "fs/efivarfs/Kconfig" endif # MISC_FILESYSTEMS diff --git a/fs/Makefile b/fs/Makefile index 9d53192236fc..0fde6a369a72 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -127,3 +127,4 @@ obj-$(CONFIG_F2FS_FS) += f2fs/ obj-y += exofs/ # Multiple modules obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ +obj-$(CONFIG_EFIVAR_FS) += efivarfs/ diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig index e55182a74605..c5a7787dd5e9 100644 --- a/fs/adfs/Kconfig +++ b/fs/adfs/Kconfig @@ -1,6 +1,6 @@ config ADFS_FS - tristate "ADFS file system support (EXPERIMENTAL)" - depends on BLOCK && EXPERIMENTAL + tristate "ADFS file system support" + depends on BLOCK help The Acorn Disc Filing System is the standard file system of the RiscOS operating system which runs on Acorn's ARM-based Risc PC diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index b3be2e7c5643..9cf874ce8336 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -19,7 +19,7 @@ static DEFINE_RWLOCK(adfs_dir_lock); static int adfs_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; struct object_info obj; diff --git a/fs/adfs/super.c b/fs/adfs/super.c index d57122935793..0ff4bae2c2a2 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -524,6 +524,7 @@ static struct file_system_type adfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("adfs"); static int __init init_adfs_fs(void) { diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig index cfad9afb4762..a04d9e848d05 100644 --- a/fs/affs/Kconfig +++ b/fs/affs/Kconfig @@ -1,6 +1,6 @@ config AFFS_FS - tristate "Amiga FFS file system support (EXPERIMENTAL)" - depends on BLOCK && EXPERIMENTAL + tristate "Amiga FFS file system support" + depends on BLOCK help The Fast File System (FFS) is the common file system used on hard disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20). Say Y diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index eb82ee53ee0b..d9a43674cb94 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -125,9 +125,8 @@ static void affs_fix_dcache(struct inode *inode, u32 entry_ino) { struct dentry *dentry; - struct hlist_node *p; spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) { + hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { if (entry_ino == (u32)(long)dentry->d_fsdata) { dentry->d_fsdata = (void *)inode->i_ino; break; diff --git a/fs/affs/dir.c b/fs/affs/dir.c index 8ca8f3a55599..fd11a6d608ee 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -42,7 +42,7 @@ const struct inode_operations affs_dir_inode_operations = { static int affs_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; struct buffer_head *dir_bh; struct buffer_head *fh_bh; diff --git a/fs/affs/super.c b/fs/affs/super.c index b84dc7352502..45161a832bbc 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -622,6 +622,7 @@ static struct file_system_type affs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("affs"); static int __init init_affs_fs(void) { diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig index 8f975f25b486..ebba3b18e5da 100644 --- a/fs/afs/Kconfig +++ b/fs/afs/Kconfig @@ -1,6 +1,6 @@ config AFS_FS - tristate "Andrew File System support (AFS) (EXPERIMENTAL)" - depends on INET && EXPERIMENTAL + tristate "Andrew File System support (AFS)" + depends on INET select AF_RXRPC select DNS_RESOLVER help @@ -22,8 +22,7 @@ config AFS_DEBUG If unsure, say N. config AFS_FSCACHE - bool "Provide AFS client caching support (EXPERIMENTAL)" - depends on EXPERIMENTAL + bool "Provide AFS client caching support" depends on AFS_FS=m && FSCACHE || AFS_FS=y && FSCACHE=y help Say Y here if you want AFS data to be cached locally on disk through diff --git a/fs/afs/afs.h b/fs/afs/afs.h index c548aa346f0d..3c462ff6db63 100644 --- a/fs/afs/afs.h +++ b/fs/afs/afs.h @@ -119,8 +119,8 @@ struct afs_file_status { u64 size; /* file size */ afs_dataversion_t data_version; /* current data version */ u32 author; /* author ID */ - u32 owner; /* owner ID */ - u32 group; /* group ID */ + kuid_t owner; /* owner ID */ + kgid_t group; /* group ID */ afs_access_t caller_access; /* access rights for authenticated caller */ afs_access_t anon_access; /* access rights for unauthenticated caller */ umode_t mode; /* UNIX mode */ @@ -133,13 +133,6 @@ struct afs_file_status { /* * AFS file status change request */ -struct afs_store_status { - u32 mask; /* which bits of the struct are set */ - u32 mtime_client; /* last time client changed data */ - u32 owner; /* owner ID */ - u32 group; /* group ID */ - umode_t mode; /* UNIX mode */ -}; #define AFS_SET_MTIME 0x01 /* set the mtime */ #define AFS_SET_OWNER 0x02 /* set the owner ID */ diff --git a/fs/afs/dir.c b/fs/afs/dir.c index db477906ba4f..7a465ed04444 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -393,12 +393,12 @@ static int afs_readdir(struct file *file, void *cookie, filldir_t filldir) int ret; _enter("{%Ld,{%lu}}", - file->f_pos, file->f_path.dentry->d_inode->i_ino); + file->f_pos, file_inode(file)->i_ino); ASSERT(file->private_data != NULL); fpos = file->f_pos; - ret = afs_dir_iterate(file->f_path.dentry->d_inode, &fpos, + ret = afs_dir_iterate(file_inode(file), &fpos, cookie, filldir, file->private_data); file->f_pos = fpos; diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 757d664575dd..2497bf306c70 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -514,7 +514,7 @@ error: */ int afs_lock(struct file *file, int cmd, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); _enter("{%x:%u},%d,{t=%x,fl=%x,r=%Ld:%Ld}", vnode->fid.vid, vnode->fid.vnode, cmd, @@ -537,7 +537,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl) */ int afs_flock(struct file *file, int cmd, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); _enter("{%x:%u},%d,{t=%x,fl=%x}", vnode->fid.vid, vnode->fid.vnode, cmd, diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index b960ff05ea0b..c2e930ec2888 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -42,6 +42,8 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp, umode_t mode; u64 data_version, size; u32 changed = 0; /* becomes non-zero if ctime-type changes seen */ + kuid_t owner; + kgid_t group; #define EXTRACT(DST) \ do { \ @@ -56,7 +58,9 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp, size = ntohl(*bp++); data_version = ntohl(*bp++); EXTRACT(status->author); - EXTRACT(status->owner); + owner = make_kuid(&init_user_ns, ntohl(*bp++)); + changed |= !uid_eq(owner, status->owner); + status->owner = owner; EXTRACT(status->caller_access); /* call ticket dependent */ EXTRACT(status->anon_access); EXTRACT(status->mode); @@ -65,7 +69,9 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp, bp++; /* seg size */ status->mtime_client = ntohl(*bp++); status->mtime_server = ntohl(*bp++); - EXTRACT(status->group); + group = make_kgid(&init_user_ns, ntohl(*bp++)); + changed |= !gid_eq(group, status->group); + status->group = group; bp++; /* sync counter */ data_version |= (u64) ntohl(*bp++) << 32; EXTRACT(status->lock_count); @@ -181,12 +187,12 @@ static void xdr_encode_AFS_StoreStatus(__be32 **_bp, struct iattr *attr) if (attr->ia_valid & ATTR_UID) { mask |= AFS_SET_OWNER; - owner = attr->ia_uid; + owner = from_kuid(&init_user_ns, attr->ia_uid); } if (attr->ia_valid & ATTR_GID) { mask |= AFS_SET_GROUP; - group = attr->ia_gid; + group = from_kgid(&init_user_ns, attr->ia_gid); } if (attr->ia_valid & ATTR_MODE) { diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 95cffd38239f..789bc253b5f6 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -69,7 +69,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) set_nlink(inode, vnode->status.nlink); inode->i_uid = vnode->status.owner; - inode->i_gid = 0; + inode->i_gid = GLOBAL_ROOT_GID; inode->i_size = vnode->status.size; inode->i_ctime.tv_sec = vnode->status.mtime_server; inode->i_ctime.tv_nsec = 0; @@ -175,8 +175,8 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name, inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; inode->i_op = &afs_autocell_inode_operations; set_nlink(inode, 2); - inode->i_uid = 0; - inode->i_gid = 0; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; inode->i_ctime.tv_sec = get_seconds(); inode->i_ctime.tv_nsec = 0; inode->i_atime = inode->i_mtime = inode->i_ctime; diff --git a/fs/afs/super.c b/fs/afs/super.c index 43165009428d..c4861557e385 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -24,6 +24,8 @@ #include <linux/parser.h> #include <linux/statfs.h> #include <linux/sched.h> +#include <linux/nsproxy.h> +#include <net/net_namespace.h> #include "internal.h" #define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */ @@ -43,6 +45,7 @@ struct file_system_type afs_fs_type = { .kill_sb = afs_kill_super, .fs_flags = 0, }; +MODULE_ALIAS_FS("afs"); static const struct super_operations afs_super_ops = { .statfs = afs_statfs, @@ -363,6 +366,10 @@ static struct dentry *afs_mount(struct file_system_type *fs_type, memset(¶ms, 0, sizeof(params)); + ret = -EINVAL; + if (current->nsproxy->net_ns != &init_net) + goto error; + /* parse the options and device name */ if (options) { ret = afs_parse_options(¶ms, options, &dev_name); diff --git a/fs/afs/write.c b/fs/afs/write.c index 9aa52d93c73c..7e03eadb40c0 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -120,7 +120,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { struct afs_writeback *candidate, *wb; - struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); struct page *page; struct key *key = file->private_data; unsigned from = pos & (PAGE_CACHE_SIZE - 1); @@ -245,7 +245,7 @@ int afs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { - struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); loff_t i_size, maybe_i_size; _enter("{%x:%u},{%lx}", @@ -627,8 +627,7 @@ void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call) ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - struct dentry *dentry = iocb->ki_filp->f_path.dentry; - struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); + struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp)); ssize_t result; size_t count = iov_length(iov, nr_segs); @@ -101,7 +101,7 @@ static int aio_setup_ring(struct kioctx *ctx) struct aio_ring *ring; struct aio_ring_info *info = &ctx->ring_info; unsigned nr_events = ctx->max_reqs; - unsigned long size; + unsigned long size, populate; int nr_pages; /* Compensate for the ring buffer's head/tail overlap entry */ @@ -129,7 +129,8 @@ static int aio_setup_ring(struct kioctx *ctx) down_write(&ctx->mm->mmap_sem); info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, PROT_READ|PROT_WRITE, - MAP_ANONYMOUS|MAP_PRIVATE, 0); + MAP_ANONYMOUS|MAP_PRIVATE, 0, + &populate); if (IS_ERR((void *)info->mmap_base)) { up_write(&ctx->mm->mmap_sem); info->mmap_size = 0; @@ -147,6 +148,8 @@ static int aio_setup_ring(struct kioctx *ctx) aio_free_ring(ctx); return -EAGAIN; } + if (populate) + mm_populate(info->mmap_base, populate); ctx->user_id = info->mmap_base; @@ -588,11 +591,10 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) { struct mm_struct *mm = current->mm; struct kioctx *ctx, *ret = NULL; - struct hlist_node *n; rcu_read_lock(); - hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) { + hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) { /* * RCU protects us against accessing freed memory but * we have to be careful not to get a reference when the diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 28d39fb84ae3..47a65df8c871 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -131,7 +131,6 @@ struct file *anon_inode_getfile(const char *name, struct qstr this; struct path path; struct file *file; - int error; if (IS_ERR(anon_inode_inode)) return ERR_PTR(-ENODEV); @@ -143,7 +142,7 @@ struct file *anon_inode_getfile(const char *name, * Link the inode to a directory entry by creating a unique name * using the inode sequence number. */ - error = -ENOMEM; + file = ERR_PTR(-ENOMEM); this.name = name; this.len = strlen(name); this.hash = 0; @@ -160,15 +159,12 @@ struct file *anon_inode_getfile(const char *name, d_instantiate(path.dentry, anon_inode_inode); - error = -ENFILE; file = alloc_file(&path, OPEN_FMODE(flags), fops); - if (!file) + if (IS_ERR(file)) goto err_dput; file->f_mapping = anon_inode_inode->i_mapping; - file->f_pos = 0; file->f_flags = flags & (O_ACCMODE | O_NONBLOCK); - file->f_version = 0; file->private_data = priv; return file; @@ -177,7 +173,7 @@ err_dput: path_put(&path); err_module: module_put(fops->owner); - return ERR_PTR(error); + return file; } EXPORT_SYMBOL_GPL(anon_inode_getfile); diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index b785e7707959..3f1128b37e46 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -273,7 +273,7 @@ static inline int autofs_prepare_pipe(struct file *pipe) { if (!pipe->f_op || !pipe->f_op->write) return -EINVAL; - if (!S_ISFIFO(pipe->f_dentry->d_inode->i_mode)) + if (!S_ISFIFO(file_inode(pipe)->i_mode)) return -EINVAL; /* We want a packet pipe */ pipe->f_flags |= O_DIRECT; diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 9f68a37bb2b2..743c7c2c949d 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -159,7 +159,7 @@ static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f) struct inode *inode; if (f) { - inode = f->f_path.dentry->d_inode; + inode = file_inode(f); sbi = autofs4_sbi(inode->i_sb); } return sbi; diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c index cddc74b9cdb2..b3db517e89ec 100644 --- a/fs/autofs4/init.c +++ b/fs/autofs4/init.c @@ -26,6 +26,7 @@ static struct file_system_type autofs_fs_type = { .mount = autofs_mount, .kill_sb = autofs4_kill_sb, }; +MODULE_ALIAS_FS("autofs"); static int __init init_autofs4_fs(void) { diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index c93447604da8..9bd16255dd9c 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -383,8 +383,10 @@ static struct vfsmount *autofs4_d_automount(struct path *path) goto done; } } else { - if (!simple_empty(dentry)) + if (!simple_empty(dentry)) { + spin_unlock(&sbi->fs_lock); goto done; + } } ino->flags |= AUTOFS_INF_PENDING; spin_unlock(&sbi->fs_lock); @@ -587,7 +589,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) /* This allows root to remove symlinks */ if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) - return -EACCES; + return -EPERM; if (atomic_dec_and_test(&ino->count)) { p_ino = autofs4_dentry_ino(dentry->d_parent); @@ -874,7 +876,7 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp, static long autofs4_root_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = file_inode(filp); return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); } @@ -882,7 +884,7 @@ static long autofs4_root_ioctl(struct file *filp, static long autofs4_root_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); int ret; if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL) diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 03bc1d347d8e..3db70dae40d3 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -42,10 +42,8 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi) while (wq) { nwq = wq->next; wq->status = -ENOENT; /* Magic is gone - report failure */ - if (wq->name.name) { - kfree(wq->name.name); - wq->name.name = NULL; - } + kfree(wq->name.name); + wq->name.name = NULL; wq->wait_ctr--; wake_up_interruptible(&wq->queue); wq = nwq; diff --git a/fs/befs/Kconfig b/fs/befs/Kconfig index 7835d30f211f..edc5cc2aefad 100644 --- a/fs/befs/Kconfig +++ b/fs/befs/Kconfig @@ -1,6 +1,6 @@ config BEFS_FS - tristate "BeOS file system (BeFS) support (read only) (EXPERIMENTAL)" - depends on BLOCK && EXPERIMENTAL + tristate "BeOS file system (BeFS) support (read only)" + depends on BLOCK select NLS help The BeOS File System (BeFS) is the native file system of Be, Inc's diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 2b3bda8d5e68..8615ee89ab55 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -213,7 +213,7 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) static int befs_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; befs_data_stream *ds = &BEFS_I(inode)->i_data.ds; befs_off_t value; @@ -951,6 +951,7 @@ static struct file_system_type befs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("befs"); static int __init init_befs_fs(void) diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig index c2336c62024f..3728a6479c64 100644 --- a/fs/bfs/Kconfig +++ b/fs/bfs/Kconfig @@ -1,6 +1,6 @@ config BFS_FS - tristate "BFS file system support (EXPERIMENTAL)" - depends on BLOCK && EXPERIMENTAL + tristate "BFS file system support" + depends on BLOCK help Boot File System (BFS) is a file system used under SCO UnixWare to allow the bootloader access to the kernel image and other important diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 2785ef91191a..3f422f6bb5ca 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -28,7 +28,7 @@ static struct buffer_head *bfs_find_entry(struct inode *dir, static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir) { - struct inode *dir = f->f_path.dentry->d_inode; + struct inode *dir = file_inode(f); struct buffer_head *bh; struct bfs_dirent *de; struct bfs_sb_info *info = BFS_SB(dir->i_sb); diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 737aaa3f7090..5e376bb93419 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -473,6 +473,7 @@ static struct file_system_type bfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("bfs"); static int __init init_bfs_fs(void) { diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 6043567b95c2..bbc8f8827eac 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -214,7 +214,7 @@ static int load_aout_binary(struct linux_binprm * bprm) if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC && N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) || N_TRSIZE(ex) || N_DRSIZE(ex) || - i_size_read(bprm->file->f_path.dentry->d_inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { + i_size_read(file_inode(bprm->file)) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { return -ENOEXEC; } @@ -367,7 +367,7 @@ static int load_aout_library(struct file *file) int retval; struct exec ex; - inode = file->f_path.dentry->d_inode; + inode = file_inode(file); retval = -ENOEXEC; error = kernel_read(file, 0, (char *) &ex, sizeof(ex)); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 0c42cdbabecf..3939829f6c5c 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -33,6 +33,7 @@ #include <linux/elf.h> #include <linux/utsname.h> #include <linux/coredump.h> +#include <linux/sched.h> #include <asm/uaccess.h> #include <asm/param.h> #include <asm/page.h> @@ -321,6 +322,8 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, return 0; } +#ifndef elf_map + static unsigned long elf_map(struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type, unsigned long total_size) @@ -355,6 +358,8 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, return(map_addr); } +#endif /* !elf_map */ + static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr) { int i, first_idx = -1, last_idx = -1; @@ -1140,7 +1145,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, /* By default, dump shared memory if mapped from an anonymous file. */ if (vma->vm_flags & VM_SHARED) { - if (vma->vm_file->f_path.dentry->d_inode->i_nlink == 0 ? + if (file_inode(vma->vm_file)->i_nlink == 0 ? FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED)) goto whole; return 0; @@ -1248,7 +1253,7 @@ static int writenote(struct memelfnote *men, struct file *file, #undef DUMP_WRITE static void fill_elf_header(struct elfhdr *elf, int segs, - u16 machine, u32 flags, u8 osabi) + u16 machine, u32 flags) { memset(elf, 0, sizeof(*elf)); @@ -1320,8 +1325,11 @@ static void fill_prstatus(struct elf_prstatus *prstatus, cputime_to_timeval(cputime.utime, &prstatus->pr_utime); cputime_to_timeval(cputime.stime, &prstatus->pr_stime); } else { - cputime_to_timeval(p->utime, &prstatus->pr_utime); - cputime_to_timeval(p->stime, &prstatus->pr_stime); + cputime_t utime, stime; + + task_cputime(p, &utime, &stime); + cputime_to_timeval(utime, &prstatus->pr_utime); + cputime_to_timeval(stime, &prstatus->pr_stime); } cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime); cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime); @@ -1630,7 +1638,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, * Initialize the ELF file header. */ fill_elf_header(elf, phdrs, - view->e_machine, view->e_flags, view->ei_osabi); + view->e_machine, view->e_flags); /* * Allocate a structure for each thread. @@ -1870,7 +1878,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, elf_core_copy_regs(&info->prstatus->pr_reg, regs); /* Set up header */ - fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS, ELF_OSABI); + fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS); /* * Set up the notes in similar form to SVR4 core dumps made diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index dc84732e554f..9c13e023e2b7 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -909,7 +909,7 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, dynamic_error: printk("ELF FDPIC %s with invalid DYNAMIC section (inode=%lu)\n", - what, file->f_path.dentry->d_inode->i_ino); + what, file_inode(file)->i_ino); return -ELIBBAD; } @@ -1219,7 +1219,7 @@ static int maydump(struct vm_area_struct *vma, unsigned long mm_flags) /* By default, dump shared memory if mapped from an anonymous file. */ if (vma->vm_flags & VM_SHARED) { - if (vma->vm_file->f_path.dentry->d_inode->i_nlink == 0) { + if (file_inode(vma->vm_file)->i_nlink == 0) { dump_ok = test_bit(MMF_DUMP_ANON_SHARED, &mm_flags); kdcore("%08lx: %08lx: %s (share)", vma->vm_start, vma->vm_flags, dump_ok ? "yes" : "no"); @@ -1375,8 +1375,11 @@ static void fill_prstatus(struct elf_prstatus *prstatus, cputime_to_timeval(cputime.utime, &prstatus->pr_utime); cputime_to_timeval(cputime.stime, &prstatus->pr_stime); } else { - cputime_to_timeval(p->utime, &prstatus->pr_utime); - cputime_to_timeval(p->stime, &prstatus->pr_stime); + cputime_t utime, stime; + + task_cputime(p, &utime, &stime); + cputime_to_timeval(utime, &prstatus->pr_utime); + cputime_to_timeval(stime, &prstatus->pr_stime); } cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime); cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime); diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index b56371981d16..2036d21baaef 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -438,7 +438,7 @@ static int load_flat_file(struct linux_binprm * bprm, int ret; hdr = ((struct flat_hdr *) bprm->buf); /* exec-header */ - inode = bprm->file->f_path.dentry->d_inode; + inode = file_inode(bprm->file); text_len = ntohl(hdr->data_start); data_len = ntohl(hdr->data_end) - ntohl(hdr->data_start); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 0c8869fdd14e..751df5e4f61a 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -531,7 +531,7 @@ static void kill_node(Node *e) static ssize_t bm_entry_read(struct file * file, char __user * buf, size_t nbytes, loff_t *ppos) { - Node *e = file->f_path.dentry->d_inode->i_private; + Node *e = file_inode(file)->i_private; ssize_t res; char *page; @@ -550,7 +550,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { struct dentry *root; - Node *e = file->f_path.dentry->d_inode->i_private; + Node *e = file_inode(file)->i_private; int res = parse_command(buffer, count); switch (res) { @@ -720,6 +720,7 @@ static struct file_system_type bm_fs_type = { .mount = bm_mount, .kill_sb = kill_litter_super, }; +MODULE_ALIAS_FS("binfmt_misc"); static int __init init_misc_binfmt(void) { @@ -1428,6 +1428,8 @@ void bio_endio(struct bio *bio, int error) else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) error = -EIO; + trace_block_bio_complete(bio, error); + if (bio->bi_end_io) bio->bi_end_io(bio, error); } diff --git a/fs/block_dev.c b/fs/block_dev.c index 172f8491a2bd..aea605c98ba6 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -318,7 +318,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping, /* * private llseek: - * for a block special file file->f_path.dentry->d_inode->i_size is zero + * for a block special file file_inode(file)->i_size is zero * so we compute the size by hand (just as in block_read/write above) */ static loff_t block_llseek(struct file *file, loff_t offset, int whence) @@ -994,6 +994,7 @@ int revalidate_disk(struct gendisk *disk) mutex_lock(&bdev->bd_mutex); check_disk_size_change(disk, bdev); + bdev->bd_invalidated = 0; mutex_unlock(&bdev->bd_mutex); bdput(bdev); return ret; @@ -1032,7 +1033,9 @@ void bd_set_size(struct block_device *bdev, loff_t size) { unsigned bsize = bdev_logical_block_size(bdev); - bdev->bd_inode->i_size = size; + mutex_lock(&bdev->bd_inode->i_mutex); + i_size_write(bdev->bd_inode, size); + mutex_unlock(&bdev->bd_inode->i_mutex); while (bsize < PAGE_CACHE_SIZE) { if (size & bsize) break; @@ -1117,7 +1120,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } } - if (!ret && !bdev->bd_openers) { + if (!ret) { bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); bdi = blk_get_backing_dev_info(bdev); if (bdi == NULL) diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index d33f01c08b60..9a8622a5b867 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -1,11 +1,13 @@ config BTRFS_FS - tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format" - depends on EXPERIMENTAL + tristate "Btrfs filesystem Unstable disk format" select LIBCRC32C select ZLIB_INFLATE select ZLIB_DEFLATE select LZO_COMPRESS select LZO_DECOMPRESS + select RAID6_PQ + select XOR_BLOCKS + help Btrfs is a new filesystem with extents, writable snapshotting, support for multiple devices and many more features. diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 7df3e0f0ee51..3932224f99e9 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ - reada.o backref.o ulist.o qgroup.o send.o dev-replace.o + reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 04edf69be875..bd605c87adfd 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -352,11 +352,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, err = __resolve_indirect_ref(fs_info, search_commit_root, time_seq, ref, parents, extent_item_pos); - if (err) { - if (ret == 0) - ret = err; + if (err) continue; - } /* we put the first parent into the ref at hand */ ULIST_ITER_INIT(&uiter); diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index d61feca79455..310a7f6d09b1 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -19,7 +19,7 @@ #ifndef __BTRFS_BACKREF__ #define __BTRFS_BACKREF__ -#include "ioctl.h" +#include <linux/btrfs.h> #include "ulist.h" #include "extent_io.h" diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 2a8c242bc4f5..d9b97d4960e6 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -40,6 +40,8 @@ #define BTRFS_INODE_HAS_ASYNC_EXTENT 6 #define BTRFS_INODE_NEEDS_FULL_SYNC 7 #define BTRFS_INODE_COPY_EVERYTHING 8 +#define BTRFS_INODE_IN_DELALLOC_LIST 9 +#define BTRFS_INODE_READDIO_NEED_LOCK 10 /* in memory btrfs inode */ struct btrfs_inode { @@ -216,4 +218,22 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) return 0; } +/* + * Disable DIO read nolock optimization, so new dio readers will be forced + * to grab i_mutex. It is used to avoid the endless truncate due to + * nonlocked dio read. + */ +static inline void btrfs_inode_block_unlocked_dio(struct inode *inode) +{ + set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags); + smp_mb(); +} + +static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode) +{ + smp_mb__before_clear_bit(); + clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, + &BTRFS_I(inode)->runtime_flags); +} + #endif diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 11d47bfb62b4..18af6f48781a 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -813,8 +813,7 @@ static int btrfsic_process_superblock_dev_mirror( (bh->b_data + (dev_bytenr & 4095)); if (btrfs_super_bytenr(super_tmp) != dev_bytenr || - strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC, - sizeof(super_tmp->magic)) || + super_tmp->magic != cpu_to_le64(BTRFS_MAGIC) || memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) || btrfs_super_nodesize(super_tmp) != state->metablock_size || btrfs_super_leafsize(super_tmp) != state->metablock_size || diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 94ab2f80e7e3..15b94089abc4 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -372,7 +372,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, page = compressed_pages[pg_index]; page->mapping = inode->i_mapping; if (bio->bi_size) - ret = io_tree->ops->merge_bio_hook(page, 0, + ret = io_tree->ops->merge_bio_hook(WRITE, page, 0, PAGE_CACHE_SIZE, bio, 0); else @@ -655,7 +655,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, page->index = em_start >> PAGE_CACHE_SHIFT; if (comp_bio->bi_size) - ret = tree->ops->merge_bio_hook(page, 0, + ret = tree->ops->merge_bio_hook(READ, page, 0, PAGE_CACHE_SIZE, comp_bio, 0); else diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index eea5da7a2b9a..ca9d8f1a3bb6 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -651,6 +651,8 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, if (tree_mod_dont_log(fs_info, NULL)) return 0; + __tree_mod_log_free_eb(fs_info, old_root); + ret = tree_mod_alloc(fs_info, flags, &tm); if (ret < 0) goto out; @@ -736,7 +738,7 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) static noinline void tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, struct extent_buffer *src, unsigned long dst_offset, - unsigned long src_offset, int nr_items) + unsigned long src_offset, int nr_items, int log_removal) { int ret; int i; @@ -750,10 +752,12 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, } for (i = 0; i < nr_items; i++) { - ret = tree_mod_log_insert_key_locked(fs_info, src, - i + src_offset, - MOD_LOG_KEY_REMOVE); - BUG_ON(ret < 0); + if (log_removal) { + ret = tree_mod_log_insert_key_locked(fs_info, src, + i + src_offset, + MOD_LOG_KEY_REMOVE); + BUG_ON(ret < 0); + } ret = tree_mod_log_insert_key_locked(fs_info, dst, i + dst_offset, MOD_LOG_KEY_ADD); @@ -927,7 +931,6 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, buf, 1, 1); BUG_ON(ret); /* -ENOMEM */ } - tree_mod_log_free_eb(root->fs_info, buf); clean_tree_block(trans, root, buf); *last_ref = 1; } @@ -1046,6 +1049,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); + tree_mod_log_free_eb(root->fs_info, buf); btrfs_free_tree_block(trans, root, buf, parent_start, last_ref); } @@ -1138,6 +1142,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, switch (tm->op) { case MOD_LOG_KEY_REMOVE_WHILE_FREEING: BUG_ON(tm->slot < n); + /* Fallthrough */ case MOD_LOG_KEY_REMOVE_WHILE_MOVING: case MOD_LOG_KEY_REMOVE: btrfs_set_node_key(eb, &tm->key, tm->slot); @@ -1222,7 +1227,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, __tree_mod_log_rewind(eb_rewin, time_seq, tm); WARN_ON(btrfs_header_nritems(eb_rewin) > - BTRFS_NODEPTRS_PER_BLOCK(fs_info->fs_root)); + BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root)); return eb_rewin; } @@ -1441,7 +1446,7 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) */ int btrfs_realloc_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *parent, - int start_slot, int cache_only, u64 *last_ret, + int start_slot, u64 *last_ret, struct btrfs_key *progress) { struct extent_buffer *cur; @@ -1461,8 +1466,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; parent_level = btrfs_header_level(parent); - if (cache_only && parent_level != 1) - return 0; WARN_ON(trans->transaction != root->fs_info->running_transaction); WARN_ON(trans->transid != root->fs_info->generation); @@ -1508,10 +1511,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, else uptodate = 0; if (!cur || !uptodate) { - if (cache_only) { - free_extent_buffer(cur); - continue; - } if (!cur) { cur = read_tree_block(root, blocknr, blocksize, gen); @@ -1755,7 +1754,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto enospc; } - tree_mod_log_free_eb(root->fs_info, root->node); tree_mod_log_set_root_pointer(root, child); rcu_assign_pointer(root->node, child); @@ -3000,7 +2998,7 @@ static int push_node_left(struct btrfs_trans_handle *trans, push_items = min(src_nritems - 8, push_items); tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, - push_items); + push_items, 1); copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(dst_nritems), btrfs_node_key_ptr_offset(0), @@ -3071,7 +3069,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, sizeof(struct btrfs_key_ptr)); tree_mod_log_eb_copy(root->fs_info, dst, src, 0, - src_nritems - push_items, push_items); + src_nritems - push_items, push_items, 1); copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(src_nritems - push_items), @@ -3223,12 +3221,18 @@ static noinline int split_node(struct btrfs_trans_handle *trans, int mid; int ret; u32 c_nritems; + int tree_mod_log_removal = 1; c = path->nodes[level]; WARN_ON(btrfs_header_generation(c) != trans->transid); if (c == root->node) { /* trying to split the root, lets make a new one */ ret = insert_new_root(trans, root, path, level + 1); + /* + * removal of root nodes has been logged by + * tree_mod_log_set_root_pointer due to locking + */ + tree_mod_log_removal = 0; if (ret) return ret; } else { @@ -3266,7 +3270,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, (unsigned long)btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); - tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); + tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid, + tree_mod_log_removal); copy_extent_buffer(split, c, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(mid), @@ -4825,8 +4830,8 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) /* * A helper function to walk down the tree starting at min_key, and looking - * for nodes or leaves that are either in cache or have a minimum - * transaction id. This is used by the btree defrag code, and tree logging + * for nodes or leaves that are have a minimum transaction id. + * This is used by the btree defrag code, and tree logging * * This does not cow, but it does stuff the starting key it finds back * into min_key, so you can call btrfs_search_slot with cow=1 on the @@ -4847,7 +4852,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) */ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, struct btrfs_key *max_key, - struct btrfs_path *path, int cache_only, + struct btrfs_path *path, u64 min_trans) { struct extent_buffer *cur; @@ -4887,15 +4892,12 @@ again: if (sret && slot > 0) slot--; /* - * check this node pointer against the cache_only and - * min_trans parameters. If it isn't in cache or is too - * old, skip to the next one. + * check this node pointer against the min_trans parameters. + * If it is too old, old, skip to the next one. */ while (slot < nritems) { u64 blockptr; u64 gen; - struct extent_buffer *tmp; - struct btrfs_disk_key disk_key; blockptr = btrfs_node_blockptr(cur, slot); gen = btrfs_node_ptr_generation(cur, slot); @@ -4903,27 +4905,7 @@ again: slot++; continue; } - if (!cache_only) - break; - - if (max_key) { - btrfs_node_key(cur, &disk_key, slot); - if (comp_keys(&disk_key, max_key) >= 0) { - ret = 1; - goto out; - } - } - - tmp = btrfs_find_tree_block(root, blockptr, - btrfs_level_size(root, level - 1)); - - if (tmp && btrfs_buffer_uptodate(tmp, gen, 1) > 0) { - free_extent_buffer(tmp); - break; - } - if (tmp) - free_extent_buffer(tmp); - slot++; + break; } find_next_key: /* @@ -4934,7 +4916,7 @@ find_next_key: path->slots[level] = slot; btrfs_set_path_blocking(path); sret = btrfs_find_next_key(root, path, min_key, level, - cache_only, min_trans); + min_trans); if (sret == 0) { btrfs_release_path(path); goto again; @@ -5399,8 +5381,7 @@ out: /* * this is similar to btrfs_next_leaf, but does not try to preserve * and fixup the path. It looks for and returns the next key in the - * tree based on the current path and the cache_only and min_trans - * parameters. + * tree based on the current path and the min_trans parameters. * * 0 is returned if another key is found, < 0 if there are any errors * and 1 is returned if there are no higher keys in the tree @@ -5409,8 +5390,7 @@ out: * calling this function. */ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *key, int level, - int cache_only, u64 min_trans) + struct btrfs_key *key, int level, u64 min_trans) { int slot; struct extent_buffer *c; @@ -5461,22 +5441,8 @@ next: if (level == 0) btrfs_item_key_to_cpu(c, key, slot); else { - u64 blockptr = btrfs_node_blockptr(c, slot); u64 gen = btrfs_node_ptr_generation(c, slot); - if (cache_only) { - struct extent_buffer *cur; - cur = btrfs_find_tree_block(root, blockptr, - btrfs_level_size(root, level - 1)); - if (!cur || - btrfs_buffer_uptodate(cur, gen, 1) <= 0) { - slot++; - if (cur) - free_extent_buffer(cur); - goto next; - } - free_extent_buffer(cur); - } if (gen < min_trans) { slot++; goto next; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 547b7b05727f..0d82922179db 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -31,10 +31,10 @@ #include <trace/events/btrfs.h> #include <asm/kmap_types.h> #include <linux/pagemap.h> +#include <linux/btrfs.h> #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" -#include "ioctl.h" struct btrfs_trans_handle; struct btrfs_transaction; @@ -46,7 +46,7 @@ extern struct kmem_cache *btrfs_path_cachep; extern struct kmem_cache *btrfs_free_space_cachep; struct btrfs_ordered_sum; -#define BTRFS_MAGIC "_BHRfS_M" +#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ #define BTRFS_MAX_MIRRORS 3 @@ -191,6 +191,8 @@ static int btrfs_csum_sizes[] = { 4, 0 }; /* ioprio of readahead is set to idle */ #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) +#define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024) + /* * The key defines the order in the tree, and so it also defines (optimal) * block layout. @@ -336,7 +338,10 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) /* * File system states */ +#define BTRFS_FS_STATE_ERROR 0 +#define BTRFS_FS_STATE_REMOUNTING 1 +/* Super block flags */ /* Errors detected */ #define BTRFS_SUPER_FLAG_ERROR (1ULL << 2) @@ -502,6 +507,7 @@ struct btrfs_super_block { #define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) #define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6) +#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7) #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL @@ -511,6 +517,7 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + BTRFS_FEATURE_INCOMPAT_RAID56 | \ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) /* @@ -952,8 +959,20 @@ struct btrfs_dev_replace_item { #define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) #define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) #define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) +#define BTRFS_BLOCK_GROUP_RAID5 (1 << 7) +#define BTRFS_BLOCK_GROUP_RAID6 (1 << 8) #define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE -#define BTRFS_NR_RAID_TYPES 5 + +enum btrfs_raid_types { + BTRFS_RAID_RAID10, + BTRFS_RAID_RAID1, + BTRFS_RAID_DUP, + BTRFS_RAID_RAID0, + BTRFS_RAID_SINGLE, + BTRFS_RAID_RAID5, + BTRFS_RAID_RAID6, + BTRFS_NR_RAID_TYPES +}; #define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ BTRFS_BLOCK_GROUP_SYSTEM | \ @@ -961,6 +980,8 @@ struct btrfs_dev_replace_item { #define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ BTRFS_BLOCK_GROUP_RAID1 | \ + BTRFS_BLOCK_GROUP_RAID5 | \ + BTRFS_BLOCK_GROUP_RAID6 | \ BTRFS_BLOCK_GROUP_DUP | \ BTRFS_BLOCK_GROUP_RAID10) /* @@ -1185,6 +1206,10 @@ struct btrfs_block_group_cache { u64 flags; u64 sectorsize; u64 cache_generation; + + /* for raid56, this is a full stripe, without parity */ + unsigned long full_stripe_len; + unsigned int ro:1; unsigned int dirty:1; unsigned int iref:1; @@ -1225,6 +1250,28 @@ struct seq_list { u64 seq; }; +enum btrfs_orphan_cleanup_state { + ORPHAN_CLEANUP_STARTED = 1, + ORPHAN_CLEANUP_DONE = 2, +}; + +/* used by the raid56 code to lock stripes for read/modify/write */ +struct btrfs_stripe_hash { + struct list_head hash_list; + wait_queue_head_t wait; + spinlock_t lock; +}; + +/* used by the raid56 code to lock stripes for read/modify/write */ +struct btrfs_stripe_hash_table { + struct list_head stripe_cache; + spinlock_t cache_lock; + int cache_size; + struct btrfs_stripe_hash table[]; +}; + +#define BTRFS_STRIPE_HASH_TABLE_BITS 11 + /* fs_info */ struct reloc_control; struct btrfs_device; @@ -1250,6 +1297,7 @@ struct btrfs_fs_info { /* block group cache stuff */ spinlock_t block_group_cache_lock; + u64 first_logical_byte; struct rb_root block_group_cache_tree; /* keep track of unallocated space */ @@ -1288,7 +1336,23 @@ struct btrfs_fs_info { u64 last_trans_log_full_commit; unsigned long mount_opt; unsigned long compress_type:4; + /* + * It is a suggestive number, the read side is safe even it gets a + * wrong number because we will write out the data into a regular + * extent. The write side(mount/remount) is under ->s_umount lock, + * so it is also safe. + */ u64 max_inline; + /* + * Protected by ->chunk_mutex and sb->s_umount. + * + * The reason that we use two lock to protect it is because only + * remount and mount operations can change it and these two operations + * are under sb->s_umount, but the read side (chunk allocation) can not + * acquire sb->s_umount or the deadlock would happen. So we use two + * locks to protect it. On the write side, we must acquire two locks, + * and on the read side, we just need acquire one of them. + */ u64 alloc_start; struct btrfs_transaction *running_transaction; wait_queue_head_t transaction_throttle; @@ -1307,6 +1371,13 @@ struct btrfs_fs_info { struct mutex cleaner_mutex; struct mutex chunk_mutex; struct mutex volume_mutex; + + /* this is used during read/modify/write to make sure + * no two ios are trying to mod the same stripe at the same + * time + */ + struct btrfs_stripe_hash_table *stripe_hash_table; + /* * this protects the ordered operations list only while we are * processing all of the entries on it. This way we make @@ -1365,6 +1436,7 @@ struct btrfs_fs_info { */ struct list_head ordered_extents; + spinlock_t delalloc_lock; /* * all of the inodes that have delalloc bytes. It is possible for * this list to be empty even when there is still dirty data=ordered @@ -1373,13 +1445,6 @@ struct btrfs_fs_info { struct list_head delalloc_inodes; /* - * special rename and truncate targets that must be on disk before - * we're allowed to commit. This is basically the ext3 style - * data=ordered list. - */ - struct list_head ordered_operations; - - /* * there is a pool of worker threads for checksumming during writes * and a pool for checksumming after reads. This is because readers * can run with FS locks held, and the writers may be waiting for @@ -1395,6 +1460,8 @@ struct btrfs_fs_info { struct btrfs_workers flush_workers; struct btrfs_workers endio_workers; struct btrfs_workers endio_meta_workers; + struct btrfs_workers endio_raid56_workers; + struct btrfs_workers rmw_workers; struct btrfs_workers endio_meta_write_workers; struct btrfs_workers endio_write_workers; struct btrfs_workers endio_freespace_worker; @@ -1423,10 +1490,12 @@ struct btrfs_fs_info { u64 total_pinned; - /* protected by the delalloc lock, used to keep from writing - * metadata until there is a nice batch - */ - u64 dirty_metadata_bytes; + /* used to keep from writing metadata until there is a nice batch */ + struct percpu_counter dirty_metadata_bytes; + struct percpu_counter delalloc_bytes; + s32 dirty_metadata_batch; + s32 delalloc_batch; + struct list_head dirty_cowonly_roots; struct btrfs_fs_devices *fs_devices; @@ -1442,9 +1511,6 @@ struct btrfs_fs_info { struct reloc_control *reloc_ctl; - spinlock_t delalloc_lock; - u64 delalloc_bytes; - /* data_alloc_cluster is only used in ssd mode */ struct btrfs_free_cluster data_alloc_cluster; @@ -1456,6 +1522,8 @@ struct btrfs_fs_info { struct rb_root defrag_inodes; atomic_t defrag_running; + /* Used to protect avail_{data, metadata, system}_alloc_bits */ + seqlock_t profiles_lock; /* * these three are in extended format (availability of single * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other @@ -1520,7 +1588,7 @@ struct btrfs_fs_info { u64 qgroup_seq; /* filesystem state */ - u64 fs_state; + unsigned long fs_state; struct btrfs_delayed_root *delayed_root; @@ -1623,6 +1691,9 @@ struct btrfs_root { struct list_head root_list; + spinlock_t log_extents_lock[2]; + struct list_head logged_list[2]; + spinlock_t orphan_lock; atomic_t orphan_inodes; struct btrfs_block_rsv *orphan_block_rsv; @@ -1832,6 +1903,7 @@ struct btrfs_ioctl_defrag_range_args { #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) +#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) #define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ BTRFS_MOUNT_##opt) /* @@ -2936,8 +3008,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, u64 num_bytes, u64 *refs, u64 *flags); int btrfs_pin_extent(struct btrfs_root *root, u64 bytenr, u64 num, int reserved); -int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, u64 bytenr, u64 num_bytes); int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -3035,8 +3106,13 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct inode *inode); void btrfs_orphan_release_metadata(struct inode *inode); -int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending); +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + int nitems, + u64 *qgroup_reserved); +void btrfs_subvolume_release_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + u64 qgroup_reserved); int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); @@ -3092,10 +3168,10 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, int lowest_level, - int cache_only, u64 min_trans); + u64 min_trans); int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, struct btrfs_key *max_key, - struct btrfs_path *path, int cache_only, + struct btrfs_path *path, u64 min_trans); enum btrfs_compare_tree_result { BTRFS_COMPARE_TREE_NEW, @@ -3148,7 +3224,7 @@ int btrfs_search_slot_for_read(struct btrfs_root *root, int find_higher, int return_any); int btrfs_realloc_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *parent, - int start_slot, int cache_only, u64 *last_ret, + int start_slot, u64 *last_ret, struct btrfs_key *progress); void btrfs_release_path(struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); @@ -3459,9 +3535,9 @@ int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, u64 new_dirid); -int btrfs_merge_bio_hook(struct page *page, unsigned long offset, - size_t size, struct bio *bio, unsigned long bio_flags); - +int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, + size_t size, struct bio *bio, + unsigned long bio_flags); int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); @@ -3543,7 +3619,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int cache_only); + struct btrfs_root *root); /* sysfs.c */ int btrfs_init_sysfs(void); @@ -3620,11 +3696,14 @@ __printf(5, 6) void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); +/* + * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic + * will panic(). Otherwise we BUG() here. + */ #define btrfs_panic(fs_info, errno, fmt, args...) \ do { \ - struct btrfs_fs_info *_i = (fs_info); \ - __btrfs_panic(_i, __func__, __LINE__, errno, fmt, ##args); \ - BUG_ON(!(_i->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)); \ + __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ + BUG(); \ } while (0) /* acl.c */ @@ -3745,4 +3824,11 @@ static inline int is_fstree(u64 rootid) return 1; return 0; } + +static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) +{ + return signal_pending(current); +} + + #endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 34836036f01b..14fce27b4780 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -22,8 +22,9 @@ #include "disk-io.h" #include "transaction.h" -#define BTRFS_DELAYED_WRITEBACK 400 -#define BTRFS_DELAYED_BACKGROUND 100 +#define BTRFS_DELAYED_WRITEBACK 512 +#define BTRFS_DELAYED_BACKGROUND 128 +#define BTRFS_DELAYED_BATCH 16 static struct kmem_cache *delayed_node_cache; @@ -494,6 +495,15 @@ static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node, BTRFS_DELAYED_DELETION_ITEM); } +static void finish_one_item(struct btrfs_delayed_root *delayed_root) +{ + int seq = atomic_inc_return(&delayed_root->items_seq); + if ((atomic_dec_return(&delayed_root->items) < + BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) && + waitqueue_active(&delayed_root->wait)) + wake_up(&delayed_root->wait); +} + static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) { struct rb_root *root; @@ -512,10 +522,8 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) rb_erase(&delayed_item->rb_node, root); delayed_item->delayed_node->count--; - if (atomic_dec_return(&delayed_root->items) < - BTRFS_DELAYED_BACKGROUND && - waitqueue_active(&delayed_root->wait)) - wake_up(&delayed_root->wait); + + finish_one_item(delayed_root); } static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) @@ -875,7 +883,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, struct btrfs_delayed_item *delayed_item) { struct extent_buffer *leaf; - struct btrfs_item *item; char *ptr; int ret; @@ -886,7 +893,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; - item = btrfs_item_nr(leaf, path->slots[0]); ptr = btrfs_item_ptr(leaf, path->slots[0], char); write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr, @@ -1058,39 +1064,29 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) delayed_node->count--; delayed_root = delayed_node->root->fs_info->delayed_root; - if (atomic_dec_return(&delayed_root->items) < - BTRFS_DELAYED_BACKGROUND && - waitqueue_active(&delayed_root->wait)) - wake_up(&delayed_root->wait); + finish_one_item(delayed_root); } } -static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_delayed_node *node) +static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_node *node) { struct btrfs_key key; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; int ret; - mutex_lock(&node->mutex); - if (!node->inode_dirty) { - mutex_unlock(&node->mutex); - return 0; - } - key.objectid = node->inode_id; btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); key.offset = 0; + ret = btrfs_lookup_inode(trans, root, path, &key, 1); if (ret > 0) { btrfs_release_path(path); - mutex_unlock(&node->mutex); return -ENOENT; } else if (ret < 0) { - mutex_unlock(&node->mutex); return ret; } @@ -1105,11 +1101,47 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, btrfs_delayed_inode_release_metadata(root, node); btrfs_release_delayed_inode(node); - mutex_unlock(&node->mutex); return 0; } +static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_node *node) +{ + int ret; + + mutex_lock(&node->mutex); + if (!node->inode_dirty) { + mutex_unlock(&node->mutex); + return 0; + } + + ret = __btrfs_update_delayed_inode(trans, root, path, node); + mutex_unlock(&node->mutex); + return ret; +} + +static inline int +__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_delayed_node *node) +{ + int ret; + + ret = btrfs_insert_delayed_items(trans, path, node->root, node); + if (ret) + return ret; + + ret = btrfs_delete_delayed_items(trans, path, node->root, node); + if (ret) + return ret; + + ret = btrfs_update_delayed_inode(trans, node->root, path, node); + return ret; +} + /* * Called when committing the transaction. * Returns 0 on success. @@ -1119,7 +1151,6 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, int nr) { - struct btrfs_root *curr_root = root; struct btrfs_delayed_root *delayed_root; struct btrfs_delayed_node *curr_node, *prev_node; struct btrfs_path *path; @@ -1142,15 +1173,8 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, curr_node = btrfs_first_delayed_node(delayed_root); while (curr_node && (!count || (count && nr--))) { - curr_root = curr_node->root; - ret = btrfs_insert_delayed_items(trans, path, curr_root, - curr_node); - if (!ret) - ret = btrfs_delete_delayed_items(trans, path, - curr_root, curr_node); - if (!ret) - ret = btrfs_update_delayed_inode(trans, curr_root, - path, curr_node); + ret = __btrfs_commit_inode_delayed_items(trans, path, + curr_node); if (ret) { btrfs_release_delayed_node(curr_node); curr_node = NULL; @@ -1183,51 +1207,93 @@ int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, return __btrfs_run_delayed_items(trans, root, nr); } -static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, - struct btrfs_delayed_node *node) +int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, + struct inode *inode) { + struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); struct btrfs_path *path; struct btrfs_block_rsv *block_rsv; int ret; + if (!delayed_node) + return 0; + + mutex_lock(&delayed_node->mutex); + if (!delayed_node->count) { + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return 0; + } + mutex_unlock(&delayed_node->mutex); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->leave_spinning = 1; block_rsv = trans->block_rsv; - trans->block_rsv = &node->root->fs_info->delayed_block_rsv; + trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv; - ret = btrfs_insert_delayed_items(trans, path, node->root, node); - if (!ret) - ret = btrfs_delete_delayed_items(trans, path, node->root, node); - if (!ret) - ret = btrfs_update_delayed_inode(trans, node->root, path, node); - btrfs_free_path(path); + ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node); + btrfs_release_delayed_node(delayed_node); + btrfs_free_path(path); trans->block_rsv = block_rsv; + return ret; } -int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, - struct inode *inode) +int btrfs_commit_inode_delayed_inode(struct inode *inode) { + struct btrfs_trans_handle *trans; struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; int ret; if (!delayed_node) return 0; mutex_lock(&delayed_node->mutex); - if (!delayed_node->count) { + if (!delayed_node->inode_dirty) { mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_node(delayed_node); return 0; } mutex_unlock(&delayed_node->mutex); - ret = __btrfs_commit_inode_delayed_items(trans, delayed_node); + trans = btrfs_join_transaction(delayed_node->root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto trans_out; + } + path->leave_spinning = 1; + + block_rsv = trans->block_rsv; + trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv; + + mutex_lock(&delayed_node->mutex); + if (delayed_node->inode_dirty) + ret = __btrfs_update_delayed_inode(trans, delayed_node->root, + path, delayed_node); + else + ret = 0; + mutex_unlock(&delayed_node->mutex); + + btrfs_free_path(path); + trans->block_rsv = block_rsv; +trans_out: + btrfs_end_transaction(trans, delayed_node->root); + btrfs_btree_balance_dirty(delayed_node->root); +out: btrfs_release_delayed_node(delayed_node); + return ret; } @@ -1243,48 +1309,49 @@ void btrfs_remove_delayed_node(struct inode *inode) btrfs_release_delayed_node(delayed_node); } -struct btrfs_async_delayed_node { - struct btrfs_root *root; - struct btrfs_delayed_node *delayed_node; +struct btrfs_async_delayed_work { + struct btrfs_delayed_root *delayed_root; + int nr; struct btrfs_work work; }; -static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) +static void btrfs_async_run_delayed_root(struct btrfs_work *work) { - struct btrfs_async_delayed_node *async_node; + struct btrfs_async_delayed_work *async_work; + struct btrfs_delayed_root *delayed_root; struct btrfs_trans_handle *trans; struct btrfs_path *path; struct btrfs_delayed_node *delayed_node = NULL; struct btrfs_root *root; struct btrfs_block_rsv *block_rsv; - int need_requeue = 0; - int ret; + int total_done = 0; - async_node = container_of(work, struct btrfs_async_delayed_node, work); + async_work = container_of(work, struct btrfs_async_delayed_work, work); + delayed_root = async_work->delayed_root; path = btrfs_alloc_path(); if (!path) goto out; - path->leave_spinning = 1; - delayed_node = async_node->delayed_node; +again: + if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND / 2) + goto free_path; + + delayed_node = btrfs_first_prepared_delayed_node(delayed_root); + if (!delayed_node) + goto free_path; + + path->leave_spinning = 1; root = delayed_node->root; trans = btrfs_join_transaction(root); if (IS_ERR(trans)) - goto free_path; + goto release_path; block_rsv = trans->block_rsv; trans->block_rsv = &root->fs_info->delayed_block_rsv; - ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); - if (!ret) - ret = btrfs_delete_delayed_items(trans, path, root, - delayed_node); - - if (!ret) - btrfs_update_delayed_inode(trans, root, path, delayed_node); - + __btrfs_commit_inode_delayed_items(trans, path, delayed_node); /* * Maybe new delayed items have been inserted, so we need requeue * the work. Besides that, we must dequeue the empty delayed nodes @@ -1310,57 +1377,47 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) * Task1 will sleep until the transaction is commited. */ mutex_lock(&delayed_node->mutex); - if (delayed_node->count) - need_requeue = 1; - else - btrfs_dequeue_delayed_node(root->fs_info->delayed_root, - delayed_node); + btrfs_dequeue_delayed_node(root->fs_info->delayed_root, delayed_node); mutex_unlock(&delayed_node->mutex); trans->block_rsv = block_rsv; btrfs_end_transaction_dmeta(trans, root); btrfs_btree_balance_dirty_nodelay(root); + +release_path: + btrfs_release_path(path); + total_done++; + + btrfs_release_prepared_delayed_node(delayed_node); + if (async_work->nr == 0 || total_done < async_work->nr) + goto again; + free_path: btrfs_free_path(path); out: - if (need_requeue) - btrfs_requeue_work(&async_node->work); - else { - btrfs_release_prepared_delayed_node(delayed_node); - kfree(async_node); - } + wake_up(&delayed_root->wait); + kfree(async_work); } + static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, - struct btrfs_root *root, int all) + struct btrfs_root *root, int nr) { - struct btrfs_async_delayed_node *async_node; - struct btrfs_delayed_node *curr; - int count = 0; + struct btrfs_async_delayed_work *async_work; -again: - curr = btrfs_first_prepared_delayed_node(delayed_root); - if (!curr) + if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) return 0; - async_node = kmalloc(sizeof(*async_node), GFP_NOFS); - if (!async_node) { - btrfs_release_prepared_delayed_node(curr); + async_work = kmalloc(sizeof(*async_work), GFP_NOFS); + if (!async_work) return -ENOMEM; - } - - async_node->root = root; - async_node->delayed_node = curr; - async_node->work.func = btrfs_async_run_delayed_node_done; - async_node->work.flags = 0; - - btrfs_queue_worker(&root->fs_info->delayed_workers, &async_node->work); - count++; - - if (all || count < 4) - goto again; + async_work->delayed_root = delayed_root; + async_work->work.func = btrfs_async_run_delayed_root; + async_work->work.flags = 0; + async_work->nr = nr; + btrfs_queue_worker(&root->fs_info->delayed_workers, &async_work->work); return 0; } @@ -1371,30 +1428,55 @@ void btrfs_assert_delayed_root_empty(struct btrfs_root *root) WARN_ON(btrfs_first_delayed_node(delayed_root)); } +static int refs_newer(struct btrfs_delayed_root *delayed_root, + int seq, int count) +{ + int val = atomic_read(&delayed_root->items_seq); + + if (val < seq || val >= seq + count) + return 1; + return 0; +} + void btrfs_balance_delayed_items(struct btrfs_root *root) { struct btrfs_delayed_root *delayed_root; + int seq; delayed_root = btrfs_get_delayed_root(root); if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) return; + seq = atomic_read(&delayed_root->items_seq); + if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) { int ret; - ret = btrfs_wq_run_delayed_node(delayed_root, root, 1); + DEFINE_WAIT(__wait); + + ret = btrfs_wq_run_delayed_node(delayed_root, root, 0); if (ret) return; - wait_event_interruptible_timeout( - delayed_root->wait, - (atomic_read(&delayed_root->items) < - BTRFS_DELAYED_BACKGROUND), - HZ); - return; + while (1) { + prepare_to_wait(&delayed_root->wait, &__wait, + TASK_INTERRUPTIBLE); + + if (refs_newer(delayed_root, seq, + BTRFS_DELAYED_BATCH) || + atomic_read(&delayed_root->items) < + BTRFS_DELAYED_BACKGROUND) { + break; + } + if (!signal_pending(current)) + schedule(); + else + break; + } + finish_wait(&delayed_root->wait, &__wait); } - btrfs_wq_run_delayed_node(delayed_root, root, 0); + btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH); } /* Will return 0 or -ENOMEM */ diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 4f808e1baeed..1d5c5f7abe3e 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -43,6 +43,7 @@ struct btrfs_delayed_root { */ struct list_head prepare_list; atomic_t items; /* for delayed items */ + atomic_t items_seq; /* for delayed items */ int nodes; /* for delayed nodes */ wait_queue_head_t wait; }; @@ -86,6 +87,7 @@ static inline void btrfs_init_delayed_root( struct btrfs_delayed_root *delayed_root) { atomic_set(&delayed_root->items, 0); + atomic_set(&delayed_root->items_seq, 0); delayed_root->nodes = 0; spin_lock_init(&delayed_root->lock); init_waitqueue_head(&delayed_root->wait); @@ -117,6 +119,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, /* Used for evicting the inode. */ void btrfs_remove_delayed_node(struct inode *inode); void btrfs_kill_delayed_inode_items(struct inode *inode); +int btrfs_commit_inode_delayed_inode(struct inode *inode); int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index ae9411773397..b7a0641ead77 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -23,6 +23,10 @@ #include "delayed-ref.h" #include "transaction.h" +struct kmem_cache *btrfs_delayed_ref_head_cachep; +struct kmem_cache *btrfs_delayed_tree_ref_cachep; +struct kmem_cache *btrfs_delayed_data_ref_cachep; +struct kmem_cache *btrfs_delayed_extent_op_cachep; /* * delayed back reference update tracking. For subvolume trees * we queue up extent allocations and backref maintenance for @@ -422,6 +426,14 @@ again: return 1; } +void btrfs_release_ref_cluster(struct list_head *cluster) +{ + struct list_head *pos, *q; + + list_for_each_safe(pos, q, cluster) + list_del_init(pos); +} + /* * helper function to update an extent delayed ref in the * rbtree. existing and update must both have the same @@ -511,7 +523,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, ref->extent_op->flags_to_set; existing_ref->extent_op->update_flags = 1; } - kfree(ref->extent_op); + btrfs_free_delayed_extent_op(ref->extent_op); } } /* @@ -592,7 +604,7 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info, * we've updated the existing ref, free the newly * allocated ref */ - kfree(head_ref); + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); } else { delayed_refs->num_heads++; delayed_refs->num_heads_ready++; @@ -653,7 +665,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * we've updated the existing ref, free the newly * allocated ref */ - kfree(full_ref); + kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref); } else { delayed_refs->num_entries++; trans->delayed_ref_updates++; @@ -714,7 +726,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, * we've updated the existing ref, free the newly * allocated ref */ - kfree(full_ref); + kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); } else { delayed_refs->num_entries++; trans->delayed_ref_updates++; @@ -738,13 +750,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs; BUG_ON(extent_op && extent_op->is_data); - ref = kmalloc(sizeof(*ref), GFP_NOFS); + ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); if (!ref) return -ENOMEM; - head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) { - kfree(ref); + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); return -ENOMEM; } @@ -786,13 +798,13 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs; BUG_ON(extent_op && !extent_op->is_data); - ref = kmalloc(sizeof(*ref), GFP_NOFS); + ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); if (!ref) return -ENOMEM; - head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) { - kfree(ref); + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); return -ENOMEM; } @@ -826,7 +838,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; - head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) return -ENOMEM; @@ -860,3 +872,51 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) return btrfs_delayed_node_to_head(ref); return NULL; } + +void btrfs_delayed_ref_exit(void) +{ + if (btrfs_delayed_ref_head_cachep) + kmem_cache_destroy(btrfs_delayed_ref_head_cachep); + if (btrfs_delayed_tree_ref_cachep) + kmem_cache_destroy(btrfs_delayed_tree_ref_cachep); + if (btrfs_delayed_data_ref_cachep) + kmem_cache_destroy(btrfs_delayed_data_ref_cachep); + if (btrfs_delayed_extent_op_cachep) + kmem_cache_destroy(btrfs_delayed_extent_op_cachep); +} + +int btrfs_delayed_ref_init(void) +{ + btrfs_delayed_ref_head_cachep = kmem_cache_create( + "btrfs_delayed_ref_head", + sizeof(struct btrfs_delayed_ref_head), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_delayed_ref_head_cachep) + goto fail; + + btrfs_delayed_tree_ref_cachep = kmem_cache_create( + "btrfs_delayed_tree_ref", + sizeof(struct btrfs_delayed_tree_ref), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_delayed_tree_ref_cachep) + goto fail; + + btrfs_delayed_data_ref_cachep = kmem_cache_create( + "btrfs_delayed_data_ref", + sizeof(struct btrfs_delayed_data_ref), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_delayed_data_ref_cachep) + goto fail; + + btrfs_delayed_extent_op_cachep = kmem_cache_create( + "btrfs_delayed_extent_op", + sizeof(struct btrfs_delayed_extent_op), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_delayed_extent_op_cachep) + goto fail; + + return 0; +fail: + btrfs_delayed_ref_exit(); + return -ENOMEM; +} diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index c9d703693df0..f75fcaf79aeb 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -132,6 +132,15 @@ struct btrfs_delayed_ref_root { unsigned long num_heads_ready; /* + * bumped when someone is making progress on the delayed + * refs, so that other procs know they are just adding to + * contention intead of helping + */ + atomic_t procs_running_refs; + atomic_t ref_seq; + wait_queue_head_t wait; + + /* * set when the tree is flushing before a transaction commit, * used by the throttling code to decide if new updates need * to be run right away @@ -141,12 +150,47 @@ struct btrfs_delayed_ref_root { u64 run_delayed_start; }; +extern struct kmem_cache *btrfs_delayed_ref_head_cachep; +extern struct kmem_cache *btrfs_delayed_tree_ref_cachep; +extern struct kmem_cache *btrfs_delayed_data_ref_cachep; +extern struct kmem_cache *btrfs_delayed_extent_op_cachep; + +int btrfs_delayed_ref_init(void); +void btrfs_delayed_ref_exit(void); + +static inline struct btrfs_delayed_extent_op * +btrfs_alloc_delayed_extent_op(void) +{ + return kmem_cache_alloc(btrfs_delayed_extent_op_cachep, GFP_NOFS); +} + +static inline void +btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op) +{ + if (op) + kmem_cache_free(btrfs_delayed_extent_op_cachep, op); +} + static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) { WARN_ON(atomic_read(&ref->refs) == 0); if (atomic_dec_and_test(&ref->refs)) { WARN_ON(ref->in_tree); - kfree(ref); + switch (ref->type) { + case BTRFS_TREE_BLOCK_REF_KEY: + case BTRFS_SHARED_BLOCK_REF_KEY: + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + case BTRFS_SHARED_DATA_REF_KEY: + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); + break; + case 0: + kmem_cache_free(btrfs_delayed_ref_head_cachep, ref); + break; + default: + BUG(); + } } } @@ -176,8 +220,14 @@ struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head); +static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) +{ + mutex_unlock(&head->mutex); +} + int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, struct list_head *cluster, u64 search_start); +void btrfs_release_ref_cluster(struct list_head *cluster); int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 66dbc8dbddf7..7ba7b3900cb8 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -465,7 +465,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished */ - btrfs_start_delalloc_inodes(root, 0); + ret = btrfs_start_delalloc_inodes(root, 0); + if (ret) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return ret; + } btrfs_wait_ordered_extents(root, 0); trans = btrfs_start_transaction(root, 0); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a8f652dc940b..6d19a0a554aa 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -46,6 +46,7 @@ #include "check-integrity.h" #include "rcu-string.h" #include "dev-replace.h" +#include "raid56.h" #ifdef CONFIG_X86 #include <asm/cpufeature.h> @@ -56,11 +57,12 @@ static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only); -static void btrfs_destroy_ordered_operations(struct btrfs_root *root); +static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, + struct btrfs_root *root); static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_root *root); -static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t); +static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t); static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root); static int btrfs_destroy_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, @@ -420,7 +422,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) { struct extent_io_tree *tree; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 found_start; struct extent_buffer *eb; @@ -639,8 +641,15 @@ err: btree_readahead_hook(root, eb, eb->start, ret); } - if (ret) + if (ret) { + /* + * our io error hook is going to dec the io pages + * again, we have to make sure it has something + * to decrement + */ + atomic_inc(&eb->io_pages); clear_extent_buffer_uptodate(eb); + } free_extent_buffer(eb); out: return ret; @@ -654,6 +663,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) eb = (struct extent_buffer *)page->private; set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); eb->read_mirror = failed_mirror; + atomic_dec(&eb->io_pages); if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) btree_readahead_hook(root, eb, eb->start, -EIO); return -EIO; /* we fixed nothing */ @@ -670,17 +680,23 @@ static void end_workqueue_bio(struct bio *bio, int err) end_io_wq->work.flags = 0; if (bio->bi_rw & REQ_WRITE) { - if (end_io_wq->metadata == 1) + if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) btrfs_queue_worker(&fs_info->endio_meta_write_workers, &end_io_wq->work); - else if (end_io_wq->metadata == 2) + else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) btrfs_queue_worker(&fs_info->endio_freespace_worker, &end_io_wq->work); + else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) + btrfs_queue_worker(&fs_info->endio_raid56_workers, + &end_io_wq->work); else btrfs_queue_worker(&fs_info->endio_write_workers, &end_io_wq->work); } else { - if (end_io_wq->metadata) + if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) + btrfs_queue_worker(&fs_info->endio_raid56_workers, + &end_io_wq->work); + else if (end_io_wq->metadata) btrfs_queue_worker(&fs_info->endio_meta_workers, &end_io_wq->work); else @@ -695,6 +711,7 @@ static void end_workqueue_bio(struct bio *bio, int err) * 0 - if data * 1 - if normal metadta * 2 - if writing to the free space cache area + * 3 - raid parity work */ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, int metadata) @@ -946,18 +963,20 @@ static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct extent_io_tree *tree; + struct btrfs_fs_info *fs_info; + int ret; + tree = &BTRFS_I(mapping->host)->io_tree; if (wbc->sync_mode == WB_SYNC_NONE) { - struct btrfs_root *root = BTRFS_I(mapping->host)->root; - u64 num_dirty; - unsigned long thresh = 32 * 1024 * 1024; if (wbc->for_kupdate) return 0; + fs_info = BTRFS_I(mapping->host)->root->fs_info; /* this is a bit racy, but that's ok */ - num_dirty = root->fs_info->dirty_metadata_bytes; - if (num_dirty < thresh) + ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes, + BTRFS_DIRTY_METADATA_THRESH); + if (ret < 0) return 0; } return btree_write_cache_pages(mapping, wbc); @@ -1125,24 +1144,16 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { + struct btrfs_fs_info *fs_info = root->fs_info; + if (btrfs_header_generation(buf) == - root->fs_info->running_transaction->transid) { + fs_info->running_transaction->transid) { btrfs_assert_tree_locked(buf); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { - spin_lock(&root->fs_info->delalloc_lock); - if (root->fs_info->dirty_metadata_bytes >= buf->len) - root->fs_info->dirty_metadata_bytes -= buf->len; - else { - spin_unlock(&root->fs_info->delalloc_lock); - btrfs_panic(root->fs_info, -EOVERFLOW, - "Can't clear %lu bytes from " - " dirty_mdatadata_bytes (%llu)", - buf->len, - root->fs_info->dirty_metadata_bytes); - } - spin_unlock(&root->fs_info->delalloc_lock); - + __percpu_counter_add(&fs_info->dirty_metadata_bytes, + -buf->len, + fs_info->dirty_metadata_batch); /* ugh, clear_extent_buffer_dirty needs to lock the page */ btrfs_set_lock_blocking(buf); clear_extent_buffer_dirty(buf); @@ -1178,9 +1189,13 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, INIT_LIST_HEAD(&root->dirty_list); INIT_LIST_HEAD(&root->root_list); + INIT_LIST_HEAD(&root->logged_list[0]); + INIT_LIST_HEAD(&root->logged_list[1]); spin_lock_init(&root->orphan_lock); spin_lock_init(&root->inode_lock); spin_lock_init(&root->accounting_lock); + spin_lock_init(&root->log_extents_lock[0]); + spin_lock_init(&root->log_extents_lock[1]); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); init_waitqueue_head(&root->log_writer_wait); @@ -1276,6 +1291,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, 0, objectid, NULL, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); + leaf = NULL; goto fail; } @@ -1319,11 +1335,16 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_tree_unlock(leaf); + return root; + fail: - if (ret) - return ERR_PTR(ret); + if (leaf) { + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + } + kfree(root); - return root; + return ERR_PTR(ret); } static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, @@ -2004,10 +2025,24 @@ int open_ctree(struct super_block *sb, goto fail_srcu; } + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0); + if (ret) { + err = ret; + goto fail_bdi; + } + fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE * + (1 + ilog2(nr_cpu_ids)); + + ret = percpu_counter_init(&fs_info->delalloc_bytes, 0); + if (ret) { + err = ret; + goto fail_dirty_metadata_bytes; + } + fs_info->btree_inode = new_inode(sb); if (!fs_info->btree_inode) { err = -ENOMEM; - goto fail_bdi; + goto fail_delalloc_bytes; } mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); @@ -2017,7 +2052,6 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); INIT_LIST_HEAD(&fs_info->delalloc_inodes); - INIT_LIST_HEAD(&fs_info->ordered_operations); INIT_LIST_HEAD(&fs_info->caching_block_groups); spin_lock_init(&fs_info->delalloc_lock); spin_lock_init(&fs_info->trans_lock); @@ -2028,6 +2062,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->tree_mod_seq_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->reloc_mutex); + seqlock_init(&fs_info->profiles_lock); init_completion(&fs_info->kobj_unregister); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); @@ -2126,6 +2161,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT; + fs_info->first_logical_byte = (u64)-1; extent_io_tree_init(&fs_info->freed_extents[0], fs_info->btree_inode->i_mapping); @@ -2165,6 +2201,12 @@ int open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->transaction_blocked_wait); init_waitqueue_head(&fs_info->async_submit_wait); + ret = btrfs_alloc_stripe_hash_table(fs_info); + if (ret) { + err = ret; + goto fail_alloc; + } + __setup_root(4096, 4096, 4096, 4096, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID); @@ -2187,7 +2229,8 @@ int open_ctree(struct super_block *sb, goto fail_alloc; /* check FS state, whether FS is broken. */ - fs_info->fs_state |= btrfs_super_flags(disk_super); + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) + set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); if (ret) { @@ -2261,6 +2304,8 @@ int open_ctree(struct super_block *sb, leafsize = btrfs_super_leafsize(disk_super); sectorsize = btrfs_super_sectorsize(disk_super); stripesize = btrfs_super_stripesize(disk_super); + fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids)); + fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); /* * mixed block groups end up with duplicate but slightly offset @@ -2332,6 +2377,12 @@ int open_ctree(struct super_block *sb, btrfs_init_workers(&fs_info->endio_meta_write_workers, "endio-meta-write", fs_info->thread_pool_size, &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_raid56_workers, + "endio-raid56", fs_info->thread_pool_size, + &fs_info->generic_worker); + btrfs_init_workers(&fs_info->rmw_workers, + "rmw", fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", fs_info->thread_pool_size, &fs_info->generic_worker); @@ -2350,6 +2401,8 @@ int open_ctree(struct super_block *sb, */ fs_info->endio_workers.idle_thresh = 4; fs_info->endio_meta_workers.idle_thresh = 4; + fs_info->endio_raid56_workers.idle_thresh = 4; + fs_info->rmw_workers.idle_thresh = 2; fs_info->endio_write_workers.idle_thresh = 2; fs_info->endio_meta_write_workers.idle_thresh = 2; @@ -2366,6 +2419,8 @@ int open_ctree(struct super_block *sb, ret |= btrfs_start_workers(&fs_info->fixup_workers); ret |= btrfs_start_workers(&fs_info->endio_workers); ret |= btrfs_start_workers(&fs_info->endio_meta_workers); + ret |= btrfs_start_workers(&fs_info->rmw_workers); + ret |= btrfs_start_workers(&fs_info->endio_raid56_workers); ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); ret |= btrfs_start_workers(&fs_info->endio_write_workers); ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); @@ -2390,8 +2445,7 @@ int open_ctree(struct super_block *sb, sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); - if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, - sizeof(disk_super->magic))) { + if (disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) { printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); goto fail_sb_buffer; } @@ -2694,13 +2748,13 @@ fail_cleaner: * kthreads */ filemap_write_and_wait(fs_info->btree_inode->i_mapping); - invalidate_inode_pages2(fs_info->btree_inode->i_mapping); fail_block_groups: btrfs_free_block_groups(fs_info); fail_tree_roots: free_root_pointers(fs_info, 1); + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); fail_sb_buffer: btrfs_stop_workers(&fs_info->generic_worker); @@ -2710,6 +2764,8 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->workers); btrfs_stop_workers(&fs_info->endio_workers); btrfs_stop_workers(&fs_info->endio_meta_workers); + btrfs_stop_workers(&fs_info->endio_raid56_workers); + btrfs_stop_workers(&fs_info->rmw_workers); btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->endio_freespace_worker); @@ -2721,13 +2777,17 @@ fail_alloc: fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); - invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); +fail_delalloc_bytes: + percpu_counter_destroy(&fs_info->delalloc_bytes); +fail_dirty_metadata_bytes: + percpu_counter_destroy(&fs_info->dirty_metadata_bytes); fail_bdi: bdi_destroy(&fs_info->bdi); fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: + btrfs_free_stripe_hash_table(fs_info); btrfs_close_devices(fs_info->fs_devices); return err; @@ -2795,8 +2855,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) super = (struct btrfs_super_block *)bh->b_data; if (btrfs_super_bytenr(super) != bytenr || - strncmp((char *)(&super->magic), BTRFS_MAGIC, - sizeof(super->magic))) { + super->magic != cpu_to_le64(BTRFS_MAGIC)) { brelse(bh); continue; } @@ -3076,11 +3135,16 @@ int btrfs_calc_num_tolerated_disk_barrier_failures( ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0))) num_tolerated_disk_barrier_failures = 0; - else if (num_tolerated_disk_barrier_failures > 1 - && - (flags & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10))) - num_tolerated_disk_barrier_failures = 1; + else if (num_tolerated_disk_barrier_failures > 1) { + if (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID10)) { + num_tolerated_disk_barrier_failures = 1; + } else if (flags & + BTRFS_BLOCK_GROUP_RAID5) { + num_tolerated_disk_barrier_failures = 2; + } + } } } up_read(&sinfo->groups_sem); @@ -3195,6 +3259,11 @@ void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) if (btrfs_root_refs(&root->root_item) == 0) synchronize_srcu(&fs_info->subvol_srcu); + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + btrfs_free_log(NULL, root); + btrfs_free_log_root_tree(NULL, fs_info); + } + __btrfs_remove_free_space_cache(root->free_ino_pinned); __btrfs_remove_free_space_cache(root->free_ino_ctl); free_fs_root(root); @@ -3339,7 +3408,7 @@ int close_ctree(struct btrfs_root *root) printk(KERN_ERR "btrfs: commit super ret %d\n", ret); } - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) btrfs_error_commit_super(root); btrfs_put_block_group_cache(fs_info); @@ -3352,9 +3421,9 @@ int close_ctree(struct btrfs_root *root) btrfs_free_qgroup_config(root->fs_info); - if (fs_info->delalloc_bytes) { - printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", - (unsigned long long)fs_info->delalloc_bytes); + if (percpu_counter_sum(&fs_info->delalloc_bytes)) { + printk(KERN_INFO "btrfs: at unmount delalloc count %lld\n", + percpu_counter_sum(&fs_info->delalloc_bytes)); } free_extent_buffer(fs_info->extent_root->node); @@ -3384,6 +3453,8 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->workers); btrfs_stop_workers(&fs_info->endio_workers); btrfs_stop_workers(&fs_info->endio_meta_workers); + btrfs_stop_workers(&fs_info->endio_raid56_workers); + btrfs_stop_workers(&fs_info->rmw_workers); btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->endio_freespace_worker); @@ -3401,9 +3472,13 @@ int close_ctree(struct btrfs_root *root) btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); + percpu_counter_destroy(&fs_info->dirty_metadata_bytes); + percpu_counter_destroy(&fs_info->delalloc_bytes); bdi_destroy(&fs_info->bdi); cleanup_srcu_struct(&fs_info->subvol_srcu); + btrfs_free_stripe_hash_table(fs_info); + return 0; } @@ -3443,11 +3518,10 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) (unsigned long long)transid, (unsigned long long)root->fs_info->generation); was_dirty = set_extent_buffer_dirty(buf); - if (!was_dirty) { - spin_lock(&root->fs_info->delalloc_lock); - root->fs_info->dirty_metadata_bytes += buf->len; - spin_unlock(&root->fs_info->delalloc_lock); - } + if (!was_dirty) + __percpu_counter_add(&root->fs_info->dirty_metadata_bytes, + buf->len, + root->fs_info->dirty_metadata_batch); } static void __btrfs_btree_balance_dirty(struct btrfs_root *root, @@ -3457,8 +3531,7 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root, * looks as though older kernels can get into trouble with * this code, they end up stuck in balance_dirty_pages forever */ - u64 num_dirty; - unsigned long thresh = 32 * 1024 * 1024; + int ret; if (current->flags & PF_MEMALLOC) return; @@ -3466,9 +3539,9 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root, if (flush_delayed) btrfs_balance_delayed_items(root); - num_dirty = root->fs_info->dirty_metadata_bytes; - - if (num_dirty > thresh) { + ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes, + BTRFS_DIRTY_METADATA_THRESH); + if (ret > 0) { balance_dirty_pages_ratelimited( root->fs_info->btree_inode->i_mapping); } @@ -3518,7 +3591,8 @@ void btrfs_error_commit_super(struct btrfs_root *root) btrfs_cleanup_transaction(root); } -static void btrfs_destroy_ordered_operations(struct btrfs_root *root) +static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, + struct btrfs_root *root) { struct btrfs_inode *btrfs_inode; struct list_head splice; @@ -3528,7 +3602,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root) mutex_lock(&root->fs_info->ordered_operations_mutex); spin_lock(&root->fs_info->ordered_extent_lock); - list_splice_init(&root->fs_info->ordered_operations, &splice); + list_splice_init(&t->ordered_operations, &splice); while (!list_empty(&splice)) { btrfs_inode = list_entry(splice.next, struct btrfs_inode, ordered_operations); @@ -3544,35 +3618,16 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root) static void btrfs_destroy_ordered_extents(struct btrfs_root *root) { - struct list_head splice; struct btrfs_ordered_extent *ordered; - struct inode *inode; - - INIT_LIST_HEAD(&splice); spin_lock(&root->fs_info->ordered_extent_lock); - - list_splice_init(&root->fs_info->ordered_extents, &splice); - while (!list_empty(&splice)) { - ordered = list_entry(splice.next, struct btrfs_ordered_extent, - root_extent_list); - - list_del_init(&ordered->root_extent_list); - atomic_inc(&ordered->refs); - - /* the inode may be getting freed (in sys_unlink path). */ - inode = igrab(ordered->inode); - - spin_unlock(&root->fs_info->ordered_extent_lock); - if (inode) - iput(inode); - - atomic_set(&ordered->refs, 1); - btrfs_put_ordered_extent(ordered); - - spin_lock(&root->fs_info->ordered_extent_lock); - } - + /* + * This will just short circuit the ordered completion stuff which will + * make sure the ordered extent gets properly cleaned up. + */ + list_for_each_entry(ordered, &root->fs_info->ordered_extents, + root_extent_list) + set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); spin_unlock(&root->fs_info->ordered_extent_lock); } @@ -3594,11 +3649,11 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, } while ((node = rb_first(&delayed_refs->root)) != NULL) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + struct btrfs_delayed_ref_head *head = NULL; + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); atomic_set(&ref->refs, 1); if (btrfs_delayed_ref_is_head(ref)) { - struct btrfs_delayed_ref_head *head; head = btrfs_delayed_node_to_head(ref); if (!mutex_trylock(&head->mutex)) { @@ -3614,16 +3669,18 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, continue; } - kfree(head->extent_op); + btrfs_free_delayed_extent_op(head->extent_op); delayed_refs->num_heads--; if (list_empty(&head->cluster)) delayed_refs->num_heads_ready--; list_del_init(&head->cluster); } + ref->in_tree = 0; rb_erase(&ref->rb_node, &delayed_refs->root); delayed_refs->num_entries--; - + if (head) + mutex_unlock(&head->mutex); spin_unlock(&delayed_refs->lock); btrfs_put_delayed_ref(ref); @@ -3636,7 +3693,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, return ret; } -static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) +static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t) { struct btrfs_pending_snapshot *snapshot; struct list_head splice; @@ -3649,10 +3706,8 @@ static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) snapshot = list_entry(splice.next, struct btrfs_pending_snapshot, list); - + snapshot->error = -ECANCELED; list_del_init(&snapshot->list); - - kfree(snapshot); } } @@ -3671,6 +3726,8 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) delalloc_inodes); list_del_init(&btrfs_inode->delalloc_inodes); + clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &btrfs_inode->runtime_flags); btrfs_invalidate_inodes(btrfs_inode->root); } @@ -3787,6 +3844,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans->blocked = 1; wake_up(&root->fs_info->transaction_blocked_wait); + btrfs_evict_pending_snapshots(cur_trans); + cur_trans->blocked = 0; wake_up(&root->fs_info->transaction_wait); @@ -3796,8 +3855,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); - btrfs_destroy_pending_snapshots(cur_trans); - btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages, EXTENT_DIRTY); btrfs_destroy_pinned_extent(root, @@ -3823,10 +3880,8 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) while (!list_empty(&list)) { t = list_entry(list.next, struct btrfs_transaction, list); - if (!t) - break; - btrfs_destroy_ordered_operations(root); + btrfs_destroy_ordered_operations(t, root); btrfs_destroy_ordered_extents(root); @@ -3843,6 +3898,8 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) wake_up(&root->fs_info->transaction_blocked_wait); + btrfs_evict_pending_snapshots(t); + t->blocked = 0; smp_mb(); if (waitqueue_active(&root->fs_info->transaction_wait)) @@ -3856,8 +3913,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); - btrfs_destroy_pending_snapshots(t); - btrfs_destroy_delalloc_inodes(root); spin_lock(&root->fs_info->trans_lock); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 305c33efb0e3..034d7dc552b2 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -25,6 +25,13 @@ #define BTRFS_SUPER_MIRROR_MAX 3 #define BTRFS_SUPER_MIRROR_SHIFT 12 +enum { + BTRFS_WQ_ENDIO_DATA = 0, + BTRFS_WQ_ENDIO_METADATA = 1, + BTRFS_WQ_ENDIO_FREE_SPACE = 2, + BTRFS_WQ_ENDIO_RAID56 = 3, +}; + static inline u64 btrfs_sb_offset(int mirror) { u64 start = 16 * 1024; diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 614f34a899c2..81ee29eeb7ca 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -22,10 +22,10 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, if (parent && (len < BTRFS_FID_SIZE_CONNECTABLE)) { *max_len = BTRFS_FID_SIZE_CONNECTABLE; - return 255; + return FILEID_INVALID; } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) { *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE; - return 255; + return FILEID_INVALID; } len = BTRFS_FID_SIZE_NON_CONNECTABLE; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a8b8adc05070..3d551231caba 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -31,6 +31,7 @@ #include "print-tree.h" #include "transaction.h" #include "volumes.h" +#include "raid56.h" #include "locking.h" #include "free-space-cache.h" #include "math.h" @@ -72,8 +73,7 @@ enum { RESERVE_ALLOC_NO_ACCOUNT = 2, }; -static int update_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static int update_block_group(struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -103,6 +103,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups); static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, u64 num_bytes, int reserve); +static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -162,6 +164,10 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, rb_link_node(&block_group->cache_node, parent, p); rb_insert_color(&block_group->cache_node, &info->block_group_cache_tree); + + if (info->first_logical_byte > block_group->key.objectid) + info->first_logical_byte = block_group->key.objectid; + spin_unlock(&info->block_group_cache_lock); return 0; @@ -203,8 +209,11 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, break; } } - if (ret) + if (ret) { btrfs_get_block_group(ret); + if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) + info->first_logical_byte = ret->key.objectid; + } spin_unlock(&info->block_group_cache_lock); return ret; @@ -248,7 +257,8 @@ static int exclude_super_stripes(struct btrfs_root *root, cache->bytes_super += stripe_len; ret = add_excluded_extent(root, cache->key.objectid, stripe_len); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; } for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { @@ -256,13 +266,17 @@ static int exclude_super_stripes(struct btrfs_root *root, ret = btrfs_rmap_block(&root->fs_info->mapping_tree, cache->key.objectid, bytenr, 0, &logical, &nr, &stripe_len); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; while (nr--) { cache->bytes_super += stripe_len; ret = add_excluded_extent(root, logical[nr], stripe_len); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + kfree(logical); + return ret; + } } kfree(logical); @@ -468,8 +482,6 @@ out: } static int cache_block_group(struct btrfs_block_group_cache *cache, - struct btrfs_trans_handle *trans, - struct btrfs_root *root, int load_cache_only) { DEFINE_WAIT(wait); @@ -527,12 +539,6 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, cache->cached = BTRFS_CACHE_FAST; spin_unlock(&cache->lock); - /* - * We can't do the read from on-disk cache during a commit since we need - * to have the normal tree locking. Also if we are currently trying to - * allocate blocks for the tree root we can't do the fast caching since - * we likely hold important locks. - */ if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { ret = load_free_space_cache(fs_info, cache); @@ -1466,8 +1472,11 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, if (ret && !insert) { err = -ENOENT; goto out; + } else if (ret) { + err = -EIO; + WARN_ON(1); + goto out; } - BUG_ON(ret); /* Corruption */ leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, path->slots[0]); @@ -1852,6 +1861,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, *actual_bytes = discarded_bytes; + if (ret == -EOPNOTSUPP) + ret = 0; return ret; } @@ -2143,7 +2154,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, node->num_bytes); } } - mutex_unlock(&head->mutex); return ret; } @@ -2258,7 +2268,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, * process of being added. Don't run this ref yet. */ list_del_init(&locked_ref->cluster); - mutex_unlock(&locked_ref->mutex); + btrfs_delayed_ref_unlock(locked_ref); locked_ref = NULL; delayed_refs->num_heads_ready++; spin_unlock(&delayed_refs->lock); @@ -2285,7 +2295,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ref = &locked_ref->node; if (extent_op && must_insert_reserved) { - kfree(extent_op); + btrfs_free_delayed_extent_op(extent_op); extent_op = NULL; } @@ -2294,28 +2304,25 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ret = run_delayed_extent_op(trans, root, ref, extent_op); - kfree(extent_op); + btrfs_free_delayed_extent_op(extent_op); if (ret) { - list_del_init(&locked_ref->cluster); - mutex_unlock(&locked_ref->mutex); - - printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); + printk(KERN_DEBUG + "btrfs: run_delayed_extent_op " + "returned %d\n", ret); spin_lock(&delayed_refs->lock); + btrfs_delayed_ref_unlock(locked_ref); return ret; } goto next; } - - list_del_init(&locked_ref->cluster); - locked_ref = NULL; } ref->in_tree = 0; rb_erase(&ref->rb_node, &delayed_refs->root); delayed_refs->num_entries--; - if (locked_ref) { + if (!btrfs_delayed_ref_is_head(ref)) { /* * when we play the delayed ref, also correct the * ref_mod on head @@ -2337,20 +2344,29 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ret = run_one_delayed_ref(trans, root, ref, extent_op, must_insert_reserved); - btrfs_put_delayed_ref(ref); - kfree(extent_op); - count++; - + btrfs_free_delayed_extent_op(extent_op); if (ret) { - if (locked_ref) { - list_del_init(&locked_ref->cluster); - mutex_unlock(&locked_ref->mutex); - } - printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); + btrfs_delayed_ref_unlock(locked_ref); + btrfs_put_delayed_ref(ref); + printk(KERN_DEBUG + "btrfs: run_one_delayed_ref returned %d\n", ret); spin_lock(&delayed_refs->lock); return ret; } + /* + * If this node is a head, that means all the refs in this head + * have been dealt with, and we will pick the next head to deal + * with, so we must unlock the head and drop it from the cluster + * list before we release it. + */ + if (btrfs_delayed_ref_is_head(ref)) { + list_del_init(&locked_ref->cluster); + btrfs_delayed_ref_unlock(locked_ref); + locked_ref = NULL; + } + btrfs_put_delayed_ref(ref); + count++; next: cond_resched(); spin_lock(&delayed_refs->lock); @@ -2435,6 +2451,16 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, return ret; } +static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq, + int count) +{ + int val = atomic_read(&delayed_refs->ref_seq); + + if (val < seq || val >= seq + count) + return 1; + return 0; +} + /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be @@ -2469,6 +2495,44 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; INIT_LIST_HEAD(&cluster); + if (count == 0) { + count = delayed_refs->num_entries * 2; + run_most = 1; + } + + if (!run_all && !run_most) { + int old; + int seq = atomic_read(&delayed_refs->ref_seq); + +progress: + old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); + if (old) { + DEFINE_WAIT(__wait); + if (delayed_refs->num_entries < 16348) + return 0; + + prepare_to_wait(&delayed_refs->wait, &__wait, + TASK_UNINTERRUPTIBLE); + + old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); + if (old) { + schedule(); + finish_wait(&delayed_refs->wait, &__wait); + + if (!refs_newer(delayed_refs, seq, 256)) + goto progress; + else + return 0; + } else { + finish_wait(&delayed_refs->wait, &__wait); + goto again; + } + } + + } else { + atomic_inc(&delayed_refs->procs_running_refs); + } + again: loops = 0; spin_lock(&delayed_refs->lock); @@ -2477,10 +2541,6 @@ again: delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif - if (count == 0) { - count = delayed_refs->num_entries * 2; - run_most = 1; - } while (1) { if (!(run_all || run_most) && delayed_refs->num_heads_ready < 64) @@ -2500,11 +2560,15 @@ again: ret = run_clustered_refs(trans, root, &cluster); if (ret < 0) { + btrfs_release_ref_cluster(&cluster); spin_unlock(&delayed_refs->lock); btrfs_abort_transaction(trans, root, ret); + atomic_dec(&delayed_refs->procs_running_refs); return ret; } + atomic_add(ret, &delayed_refs->ref_seq); + count -= min_t(unsigned long, ret, count); if (count == 0) @@ -2573,6 +2637,11 @@ again: goto again; } out: + atomic_dec(&delayed_refs->procs_running_refs); + smp_mb(); + if (waitqueue_active(&delayed_refs->wait)) + wake_up(&delayed_refs->wait); + spin_unlock(&delayed_refs->lock); assert_qgroups_uptodate(trans); return 0; @@ -2586,7 +2655,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_delayed_extent_op *extent_op; int ret; - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + extent_op = btrfs_alloc_delayed_extent_op(); if (!extent_op) return -ENOMEM; @@ -2598,7 +2667,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, num_bytes, extent_op); if (ret) - kfree(extent_op); + btrfs_free_delayed_extent_op(extent_op); return ret; } @@ -3223,12 +3292,14 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) u64 extra_flags = chunk_to_extended(flags) & BTRFS_EXTENDED_PROFILE_MASK; + write_seqlock(&fs_info->profiles_lock); if (flags & BTRFS_BLOCK_GROUP_DATA) fs_info->avail_data_alloc_bits |= extra_flags; if (flags & BTRFS_BLOCK_GROUP_METADATA) fs_info->avail_metadata_alloc_bits |= extra_flags; if (flags & BTRFS_BLOCK_GROUP_SYSTEM) fs_info->avail_system_alloc_bits |= extra_flags; + write_sequnlock(&fs_info->profiles_lock); } /* @@ -3276,6 +3347,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) u64 num_devices = root->fs_info->fs_devices->rw_devices + root->fs_info->fs_devices->missing_devices; u64 target; + u64 tmp; /* * see if restripe for this chunk_type is in progress, if so @@ -3292,40 +3364,48 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) } spin_unlock(&root->fs_info->balance_lock); + /* First, mask out the RAID levels which aren't possible */ if (num_devices == 1) - flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); + flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID5); + if (num_devices < 3) + flags &= ~BTRFS_BLOCK_GROUP_RAID6; if (num_devices < 4) flags &= ~BTRFS_BLOCK_GROUP_RAID10; - if ((flags & BTRFS_BLOCK_GROUP_DUP) && - (flags & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10))) { - flags &= ~BTRFS_BLOCK_GROUP_DUP; - } + tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); + flags &= ~tmp; - if ((flags & BTRFS_BLOCK_GROUP_RAID1) && - (flags & BTRFS_BLOCK_GROUP_RAID10)) { - flags &= ~BTRFS_BLOCK_GROUP_RAID1; - } - - if ((flags & BTRFS_BLOCK_GROUP_RAID0) && - ((flags & BTRFS_BLOCK_GROUP_RAID1) | - (flags & BTRFS_BLOCK_GROUP_RAID10) | - (flags & BTRFS_BLOCK_GROUP_DUP))) { - flags &= ~BTRFS_BLOCK_GROUP_RAID0; - } + if (tmp & BTRFS_BLOCK_GROUP_RAID6) + tmp = BTRFS_BLOCK_GROUP_RAID6; + else if (tmp & BTRFS_BLOCK_GROUP_RAID5) + tmp = BTRFS_BLOCK_GROUP_RAID5; + else if (tmp & BTRFS_BLOCK_GROUP_RAID10) + tmp = BTRFS_BLOCK_GROUP_RAID10; + else if (tmp & BTRFS_BLOCK_GROUP_RAID1) + tmp = BTRFS_BLOCK_GROUP_RAID1; + else if (tmp & BTRFS_BLOCK_GROUP_RAID0) + tmp = BTRFS_BLOCK_GROUP_RAID0; - return extended_to_chunk(flags); + return extended_to_chunk(flags | tmp); } static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) { - if (flags & BTRFS_BLOCK_GROUP_DATA) - flags |= root->fs_info->avail_data_alloc_bits; - else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - flags |= root->fs_info->avail_system_alloc_bits; - else if (flags & BTRFS_BLOCK_GROUP_METADATA) - flags |= root->fs_info->avail_metadata_alloc_bits; + unsigned seq; + + do { + seq = read_seqbegin(&root->fs_info->profiles_lock); + + if (flags & BTRFS_BLOCK_GROUP_DATA) + flags |= root->fs_info->avail_data_alloc_bits; + else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + flags |= root->fs_info->avail_system_alloc_bits; + else if (flags & BTRFS_BLOCK_GROUP_METADATA) + flags |= root->fs_info->avail_metadata_alloc_bits; + } while (read_seqretry(&root->fs_info->profiles_lock, seq)); return btrfs_reduce_alloc_profile(root, flags); } @@ -3333,6 +3413,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) { u64 flags; + u64 ret; if (data) flags = BTRFS_BLOCK_GROUP_DATA; @@ -3341,7 +3422,8 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) else flags = BTRFS_BLOCK_GROUP_METADATA; - return get_alloc_profile(root, flags); + ret = get_alloc_profile(root, flags); + return ret; } /* @@ -3357,7 +3439,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes) int ret = 0, committed = 0, alloc_chunk = 1; /* make sure bytes are sectorsize aligned */ - bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + bytes = ALIGN(bytes, root->sectorsize); if (root == root->fs_info->tree_root || BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) { @@ -3452,7 +3534,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) struct btrfs_space_info *data_sinfo; /* make sure bytes are sectorsize aligned */ - bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + bytes = ALIGN(bytes, root->sectorsize); data_sinfo = root->fs_info->data_sinfo; spin_lock(&data_sinfo->lock); @@ -3516,8 +3598,10 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) { u64 num_dev; - if (type & BTRFS_BLOCK_GROUP_RAID10 || - type & BTRFS_BLOCK_GROUP_RAID0) + if (type & (BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) num_dev = root->fs_info->fs_devices->rw_devices; else if (type & BTRFS_BLOCK_GROUP_RAID1) num_dev = 2; @@ -3564,6 +3648,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, int wait_for_alloc = 0; int ret = 0; + /* Don't re-enter if we're already allocating a chunk */ + if (trans->allocating_chunk) + return -ENOSPC; + space_info = __find_space_info(extent_root->fs_info, flags); if (!space_info) { ret = update_space_info(extent_root->fs_info, flags, @@ -3606,6 +3694,8 @@ again: goto again; } + trans->allocating_chunk = true; + /* * If we have mixed data/metadata chunks we want to make sure we keep * allocating mixed chunks instead of individual chunks. @@ -3632,19 +3722,20 @@ again: check_system_chunk(trans, extent_root, flags); ret = btrfs_alloc_chunk(trans, extent_root, flags); - if (ret < 0 && ret != -ENOSPC) - goto out; + trans->allocating_chunk = false; spin_lock(&space_info->lock); + if (ret < 0 && ret != -ENOSPC) + goto out; if (ret) space_info->full = 1; else ret = 1; space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; +out: space_info->chunk_alloc = 0; spin_unlock(&space_info->lock); -out: mutex_unlock(&fs_info->chunk_mutex); return ret; } @@ -3653,13 +3744,31 @@ static int can_overcommit(struct btrfs_root *root, struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; u64 profile = btrfs_get_alloc_profile(root, 0); + u64 rsv_size = 0; u64 avail; u64 used; + u64 to_add; used = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; + space_info->bytes_pinned + space_info->bytes_readonly; + + spin_lock(&global_rsv->lock); + rsv_size = global_rsv->size; + spin_unlock(&global_rsv->lock); + + /* + * We only want to allow over committing if we have lots of actual space + * free, but if we don't have enough space to handle the global reserve + * space then we could end up having a real enospc problem when trying + * to allocate a chunk or some other such important allocation. + */ + rsv_size <<= 1; + if (used + rsv_size >= space_info->total_bytes) + return 0; + + used += space_info->bytes_may_use; spin_lock(&root->fs_info->free_chunk_lock); avail = root->fs_info->free_chunk_space; @@ -3667,40 +3776,58 @@ static int can_overcommit(struct btrfs_root *root, /* * If we have dup, raid1 or raid10 then only half of the free - * space is actually useable. + * space is actually useable. For raid56, the space info used + * doesn't include the parity drive, so we don't have to + * change the math */ if (profile & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) avail >>= 1; + to_add = space_info->total_bytes; + /* * If we aren't flushing all things, let us overcommit up to * 1/2th of the space. If we can flush, don't let us overcommit * too much, let it overcommit up to 1/8 of the space. */ if (flush == BTRFS_RESERVE_FLUSH_ALL) - avail >>= 3; + to_add >>= 3; else - avail >>= 1; + to_add >>= 1; - if (used + bytes < space_info->total_bytes + avail) + /* + * Limit the overcommit to the amount of free space we could possibly + * allocate for chunks. + */ + to_add = min(avail, to_add); + + if (used + bytes < space_info->total_bytes + to_add) return 1; return 0; } -static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb, - unsigned long nr_pages, - enum wb_reason reason) +void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, + unsigned long nr_pages) { - if (!writeback_in_progress(sb->s_bdi) && - down_read_trylock(&sb->s_umount)) { - writeback_inodes_sb_nr(sb, nr_pages, reason); - up_read(&sb->s_umount); - return 1; - } + struct super_block *sb = root->fs_info->sb; + int started; - return 0; + /* If we can not start writeback, just sync all the delalloc file. */ + started = try_to_writeback_inodes_sb_nr(sb, nr_pages, + WB_REASON_FS_FREE_SPACE); + if (!started) { + /* + * We needn't worry the filesystem going from r/w to r/o though + * we don't acquire ->s_umount mutex, because the filesystem + * should guarantee the delalloc inodes list be empty after + * the filesystem is readonly(all dirty pages are written to + * the disk). + */ + btrfs_start_delalloc_inodes(root, 0); + btrfs_wait_ordered_extents(root, 0); + } } /* @@ -3724,7 +3851,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, space_info = block_rsv->space_info; smp_mb(); - delalloc_bytes = root->fs_info->delalloc_bytes; + delalloc_bytes = percpu_counter_sum_positive( + &root->fs_info->delalloc_bytes); if (delalloc_bytes == 0) { if (trans) return; @@ -3735,10 +3863,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, while (delalloc_bytes && loops < 3) { max_reclaim = min(delalloc_bytes, to_reclaim); nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; - writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb, - nr_pages, - WB_REASON_FS_FREE_SPACE); - + btrfs_writeback_inodes_sb_nr(root, nr_pages); /* * We need to wait for the async pages to actually start before * we do anything. @@ -3766,7 +3891,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, break; } smp_mb(); - delalloc_bytes = root->fs_info->delalloc_bytes; + delalloc_bytes = percpu_counter_sum_positive( + &root->fs_info->delalloc_bytes); } } @@ -4030,6 +4156,15 @@ again: goto again; out: + if (ret == -ENOSPC && + unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { + struct btrfs_block_rsv *global_rsv = + &root->fs_info->global_block_rsv; + + if (block_rsv != global_rsv && + !block_rsv_use_bytes(global_rsv, orig_bytes)) + ret = 0; + } if (flushing) { spin_lock(&space_info->lock); space_info->flush = 0; @@ -4308,7 +4443,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) spin_lock(&sinfo->lock); spin_lock(&block_rsv->lock); - block_rsv->size = num_bytes; + block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024); num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + sinfo->bytes_reserved + sinfo->bytes_readonly + @@ -4416,19 +4551,60 @@ void btrfs_orphan_release_metadata(struct inode *inode) btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); } -int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending) +/* + * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation + * root: the root of the parent directory + * rsv: block reservation + * items: the number of items that we need do reservation + * qgroup_reserved: used to return the reserved size in qgroup + * + * This function is used to reserve the space for snapshot/subvolume + * creation and deletion. Those operations are different with the + * common file/directory operations, they change two fs/file trees + * and root tree, the number of items that the qgroup reserves is + * different with the free space reservation. So we can not use + * the space reseravtion mechanism in start_transaction(). + */ +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + int items, + u64 *qgroup_reserved) { - struct btrfs_root *root = pending->root; - struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); - struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; - /* - * two for root back/forward refs, two for directory entries, - * one for root of the snapshot and one for parent inode. - */ - u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6); - dst_rsv->space_info = src_rsv->space_info; - return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); + u64 num_bytes; + int ret; + + if (root->fs_info->quota_enabled) { + /* One for parent inode, two for dir entries */ + num_bytes = 3 * root->leafsize; + ret = btrfs_qgroup_reserve(root, num_bytes); + if (ret) + return ret; + } else { + num_bytes = 0; + } + + *qgroup_reserved = num_bytes; + + num_bytes = btrfs_calc_trans_metadata_size(root, items); + rsv->space_info = __find_space_info(root->fs_info, + BTRFS_BLOCK_GROUP_METADATA); + ret = btrfs_block_rsv_add(root, rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); + if (ret) { + if (*qgroup_reserved) + btrfs_qgroup_free(root, *qgroup_reserved); + } + + return ret; +} + +void btrfs_subvolume_release_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + u64 qgroup_reserved) +{ + btrfs_block_rsv_release(root, rsv, (u64)-1); + if (qgroup_reserved) + btrfs_qgroup_free(root, qgroup_reserved); } /** @@ -4534,8 +4710,10 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) unsigned nr_extents = 0; int extra_reserve = 0; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; - int ret; + int ret = 0; bool delalloc_lock = true; + u64 to_free = 0; + unsigned dropped; /* If we are a free space inode we need to not flush since we will be in * the middle of a transaction commit. We also don't need the delalloc @@ -4582,53 +4760,16 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) if (root->fs_info->quota_enabled) { ret = btrfs_qgroup_reserve(root, num_bytes + nr_extents * root->leafsize); - if (ret) { - spin_lock(&BTRFS_I(inode)->lock); - calc_csum_metadata_size(inode, num_bytes, 0); - spin_unlock(&BTRFS_I(inode)->lock); - if (delalloc_lock) - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); - return ret; - } + if (ret) + goto out_fail; } ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); - if (ret) { - u64 to_free = 0; - unsigned dropped; - - spin_lock(&BTRFS_I(inode)->lock); - dropped = drop_outstanding_extent(inode); - /* - * If the inodes csum_bytes is the same as the original - * csum_bytes then we know we haven't raced with any free()ers - * so we can just reduce our inodes csum bytes and carry on. - * Otherwise we have to do the normal free thing to account for - * the case that the free side didn't free up its reserve - * because of this outstanding reservation. - */ - if (BTRFS_I(inode)->csum_bytes == csum_bytes) - calc_csum_metadata_size(inode, num_bytes, 0); - else - to_free = calc_csum_metadata_size(inode, num_bytes, 0); - spin_unlock(&BTRFS_I(inode)->lock); - if (dropped) - to_free += btrfs_calc_trans_metadata_size(root, dropped); - - if (to_free) { - btrfs_block_rsv_release(root, block_rsv, to_free); - trace_btrfs_space_reservation(root->fs_info, - "delalloc", - btrfs_ino(inode), - to_free, 0); - } - if (root->fs_info->quota_enabled) { + if (unlikely(ret)) { + if (root->fs_info->quota_enabled) btrfs_qgroup_free(root, num_bytes + nr_extents * root->leafsize); - } - if (delalloc_lock) - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); - return ret; + goto out_fail; } spin_lock(&BTRFS_I(inode)->lock); @@ -4649,6 +4790,69 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) block_rsv_add_bytes(block_rsv, to_reserve, 1); return 0; + +out_fail: + spin_lock(&BTRFS_I(inode)->lock); + dropped = drop_outstanding_extent(inode); + /* + * If the inodes csum_bytes is the same as the original + * csum_bytes then we know we haven't raced with any free()ers + * so we can just reduce our inodes csum bytes and carry on. + */ + if (BTRFS_I(inode)->csum_bytes == csum_bytes) { + calc_csum_metadata_size(inode, num_bytes, 0); + } else { + u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes; + u64 bytes; + + /* + * This is tricky, but first we need to figure out how much we + * free'd from any free-ers that occured during this + * reservation, so we reset ->csum_bytes to the csum_bytes + * before we dropped our lock, and then call the free for the + * number of bytes that were freed while we were trying our + * reservation. + */ + bytes = csum_bytes - BTRFS_I(inode)->csum_bytes; + BTRFS_I(inode)->csum_bytes = csum_bytes; + to_free = calc_csum_metadata_size(inode, bytes, 0); + + + /* + * Now we need to see how much we would have freed had we not + * been making this reservation and our ->csum_bytes were not + * artificially inflated. + */ + BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes; + bytes = csum_bytes - orig_csum_bytes; + bytes = calc_csum_metadata_size(inode, bytes, 0); + + /* + * Now reset ->csum_bytes to what it should be. If bytes is + * more than to_free then we would have free'd more space had we + * not had an artificially high ->csum_bytes, so we need to free + * the remainder. If bytes is the same or less then we don't + * need to do anything, the other free-ers did the correct + * thing. + */ + BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes; + if (bytes > to_free) + to_free = bytes - to_free; + else + to_free = 0; + } + spin_unlock(&BTRFS_I(inode)->lock); + if (dropped) + to_free += btrfs_calc_trans_metadata_size(root, dropped); + + if (to_free) { + btrfs_block_rsv_release(root, block_rsv, to_free); + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), to_free, 0); + } + if (delalloc_lock) + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); + return ret; } /** @@ -4670,7 +4874,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) spin_lock(&BTRFS_I(inode)->lock); dropped = drop_outstanding_extent(inode); - to_free = calc_csum_metadata_size(inode, num_bytes, 0); + if (num_bytes) + to_free = calc_csum_metadata_size(inode, num_bytes, 0); spin_unlock(&BTRFS_I(inode)->lock); if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); @@ -4737,8 +4942,7 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) btrfs_free_reserved_data_space(inode, num_bytes); } -static int update_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static int update_block_group(struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc) { struct btrfs_block_group_cache *cache = NULL; @@ -4775,7 +4979,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, * space back to the block group, otherwise we will leak space. */ if (!alloc && cache->cached == BTRFS_CACHE_NO) - cache_block_group(cache, trans, NULL, 1); + cache_block_group(cache, 1); byte_in_group = bytenr - cache->key.objectid; WARN_ON(byte_in_group > cache->key.offset); @@ -4825,6 +5029,13 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) struct btrfs_block_group_cache *cache; u64 bytenr; + spin_lock(&root->fs_info->block_group_cache_lock); + bytenr = root->fs_info->first_logical_byte; + spin_unlock(&root->fs_info->block_group_cache_lock); + + if (bytenr < (u64)-1) + return bytenr; + cache = btrfs_lookup_first_block_group(root->fs_info, search_start); if (!cache) return 0; @@ -4875,8 +5086,7 @@ int btrfs_pin_extent(struct btrfs_root *root, /* * this function must be called within transaction */ -int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, u64 bytenr, u64 num_bytes) { struct btrfs_block_group_cache *cache; @@ -4890,7 +5100,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, * to one because the slow code to read in the free extents does check * the pinned extents. */ - cache_block_group(cache, trans, root, 1); + cache_block_group(cache, 1); pin_down_extent(root, cache, bytenr, num_bytes, 0); @@ -5287,7 +5497,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } - ret = update_block_group(trans, root, bytenr, num_bytes, 0); + ret = update_block_group(root, bytenr, num_bytes, 0); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -5332,7 +5542,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (head->extent_op) { if (!head->must_insert_reserved) goto out; - kfree(head->extent_op); + btrfs_free_delayed_extent_op(head->extent_op); head->extent_op = NULL; } @@ -5455,10 +5665,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, return ret; } -static u64 stripe_align(struct btrfs_root *root, u64 val) +static u64 stripe_align(struct btrfs_root *root, + struct btrfs_block_group_cache *cache, + u64 val, u64 num_bytes) { - u64 mask = ((u64)root->stripesize - 1); - u64 ret = (val + mask) & ~mask; + u64 ret = ALIGN(val, root->stripesize); return ret; } @@ -5478,7 +5689,6 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, u64 num_bytes) { struct btrfs_caching_control *caching_ctl; - DEFINE_WAIT(wait); caching_ctl = get_caching_control(cache); if (!caching_ctl) @@ -5495,7 +5705,6 @@ static noinline int wait_block_group_cache_done(struct btrfs_block_group_cache *cache) { struct btrfs_caching_control *caching_ctl; - DEFINE_WAIT(wait); caching_ctl = get_caching_control(cache); if (!caching_ctl) @@ -5509,20 +5718,20 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) int __get_raid_index(u64 flags) { - int index; - if (flags & BTRFS_BLOCK_GROUP_RAID10) - index = 0; + return BTRFS_RAID_RAID10; else if (flags & BTRFS_BLOCK_GROUP_RAID1) - index = 1; + return BTRFS_RAID_RAID1; else if (flags & BTRFS_BLOCK_GROUP_DUP) - index = 2; + return BTRFS_RAID_DUP; else if (flags & BTRFS_BLOCK_GROUP_RAID0) - index = 3; - else - index = 4; + return BTRFS_RAID_RAID0; + else if (flags & BTRFS_BLOCK_GROUP_RAID5) + return BTRFS_RAID_RAID5; + else if (flags & BTRFS_BLOCK_GROUP_RAID6) + return BTRFS_RAID_RAID6; - return index; + return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ } static int get_block_group_index(struct btrfs_block_group_cache *cache) @@ -5665,6 +5874,8 @@ search: if (!block_group_bits(block_group, data)) { u64 extra = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10; /* @@ -5680,8 +5891,7 @@ have_block_group: cached = block_group_cache_done(block_group); if (unlikely(!cached)) { found_uncached_bg = true; - ret = cache_block_group(block_group, trans, - orig_root, 0); + ret = cache_block_group(block_group, 0); BUG_ON(ret < 0); ret = 0; } @@ -5694,6 +5904,7 @@ have_block_group: * lets look there */ if (last_ptr) { + unsigned long aligned_cluster; /* * the refill lock keeps out other * people trying to start a new cluster @@ -5760,11 +5971,15 @@ refill_cluster: goto unclustered_alloc; } + aligned_cluster = max_t(unsigned long, + empty_cluster + empty_size, + block_group->full_stripe_len); + /* allocate a cluster in this block group */ ret = btrfs_find_space_cluster(trans, root, block_group, last_ptr, search_start, num_bytes, - empty_cluster + empty_size); + aligned_cluster); if (ret == 0) { /* * now pull our allocation out of this @@ -5835,7 +6050,8 @@ unclustered_alloc: goto loop; } checks: - search_start = stripe_align(root, offset); + search_start = stripe_align(root, used_block_group, + offset, num_bytes); /* move on to the next group */ if (search_start + num_bytes > @@ -5986,7 +6202,7 @@ again: if (ret == -ENOSPC) { if (!final_tried) { num_bytes = num_bytes >> 1; - num_bytes = num_bytes & ~(root->sectorsize - 1); + num_bytes = round_down(num_bytes, root->sectorsize); num_bytes = max(num_bytes, min_alloc_size); if (num_bytes == min_alloc_size) final_tried = true; @@ -6110,7 +6326,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); + ret = update_block_group(root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ printk(KERN_ERR "btrfs update block group failed for %llu " "%llu\n", (unsigned long long)ins->objectid, @@ -6174,7 +6390,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); + ret = update_block_group(root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ printk(KERN_ERR "btrfs update block group failed for %llu " "%llu\n", (unsigned long long)ins->objectid, @@ -6217,7 +6433,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, u64 num_bytes = ins->offset; block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); - cache_block_group(block_group, trans, NULL, 0); + cache_block_group(block_group, 0); caching_ctl = get_caching_control(block_group); if (!caching_ctl) { @@ -6331,12 +6547,14 @@ use_block_rsv(struct btrfs_trans_handle *trans, if (!ret) return block_rsv; if (ret && !block_rsv->failfast) { - static DEFINE_RATELIMIT_STATE(_rs, - DEFAULT_RATELIMIT_INTERVAL, - /*DEFAULT_RATELIMIT_BURST*/ 2); - if (__ratelimit(&_rs)) - WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n", - ret); + if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL * 10, + /*DEFAULT_RATELIMIT_BURST*/ 1); + if (__ratelimit(&_rs)) + WARN(1, KERN_DEBUG + "btrfs: block rsv returned %d\n", ret); + } ret = reserve_metadata_bytes(root, block_rsv, blocksize, BTRFS_RESERVE_NO_FLUSH); if (!ret) { @@ -6402,7 +6620,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { struct btrfs_delayed_extent_op *extent_op; - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + extent_op = btrfs_alloc_delayed_extent_op(); BUG_ON(!extent_op); /* -ENOMEM */ if (key) memcpy(&extent_op->key, key, sizeof(extent_op->key)); @@ -6524,7 +6742,7 @@ reada: } /* - * hepler to process tree block while walking down the tree. + * helper to process tree block while walking down the tree. * * when wc->stage == UPDATE_BACKREF, this function updates * back refs for pointers in the block. @@ -6599,7 +6817,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, } /* - * hepler to process tree block pointer. + * helper to process tree block pointer. * * when wc->stage == DROP_REFERENCE, this function checks * reference count of the block pointed to. if the block @@ -6737,7 +6955,7 @@ skip: } /* - * hepler to process tree block while walking up the tree. + * helper to process tree block while walking up the tree. * * when wc->stage == DROP_REFERENCE, this function drops * reference count on the block. @@ -7205,6 +7423,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) root->fs_info->fs_devices->missing_devices; stripped = BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; if (num_devices == 1) { @@ -7483,16 +7702,16 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) index = get_block_group_index(block_group); } - if (index == 0) { + if (index == BTRFS_RAID_RAID10) { dev_min = 4; /* Divide by 2 */ min_free >>= 1; - } else if (index == 1) { + } else if (index == BTRFS_RAID_RAID1) { dev_min = 2; - } else if (index == 2) { + } else if (index == BTRFS_RAID_DUP) { /* Multiply by 2 */ min_free <<= 1; - } else if (index == 3) { + } else if (index == BTRFS_RAID_RAID0) { dev_min = fs_devices->rw_devices; do_div(min_free, dev_min); } @@ -7653,11 +7872,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) space_info = list_entry(info->space_info.next, struct btrfs_space_info, list); - if (space_info->bytes_pinned > 0 || - space_info->bytes_reserved > 0 || - space_info->bytes_may_use > 0) { - WARN_ON(1); - dump_space_info(space_info, 0, 0); + if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) { + if (space_info->bytes_pinned > 0 || + space_info->bytes_reserved > 0 || + space_info->bytes_may_use > 0) { + WARN_ON(1); + dump_space_info(space_info, 0, 0); + } } list_del(&space_info->list); kfree(space_info); @@ -7756,7 +7977,9 @@ int btrfs_read_block_groups(struct btrfs_root *root) btrfs_release_path(path); cache->flags = btrfs_block_group_flags(&cache->item); cache->sectorsize = root->sectorsize; - + cache->full_stripe_len = btrfs_full_stripe_len(root, + &root->fs_info->mapping_tree, + found_key.objectid); btrfs_init_free_space_ctl(cache); /* @@ -7764,7 +7987,17 @@ int btrfs_read_block_groups(struct btrfs_root *root) * info has super bytes accounted for, otherwise we'll think * we have more space than we actually do. */ - exclude_super_stripes(root, cache); + ret = exclude_super_stripes(root, cache); + if (ret) { + /* + * We may have excluded something, so call this just in + * case. + */ + free_excluded_extents(root, cache); + kfree(cache->free_space_ctl); + kfree(cache); + goto error; + } /* * check for two cases, either we are full, and therefore @@ -7810,6 +8043,8 @@ int btrfs_read_block_groups(struct btrfs_root *root) if (!(get_alloc_profile(root, space_info->flags) & (BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_DUP))) continue; /* @@ -7885,6 +8120,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; cache->sectorsize = root->sectorsize; cache->fs_info = root->fs_info; + cache->full_stripe_len = btrfs_full_stripe_len(root, + &root->fs_info->mapping_tree, + chunk_offset); atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); @@ -7901,7 +8139,17 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; - exclude_super_stripes(root, cache); + ret = exclude_super_stripes(root, cache); + if (ret) { + /* + * We may have excluded something, so call this just in + * case. + */ + free_excluded_extents(root, cache); + kfree(cache->free_space_ctl); + kfree(cache); + return ret; + } add_new_free_space(cache, root->fs_info, chunk_offset, chunk_offset + size); @@ -7934,12 +8182,14 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) u64 extra_flags = chunk_to_extended(flags) & BTRFS_EXTENDED_PROFILE_MASK; + write_seqlock(&fs_info->profiles_lock); if (flags & BTRFS_BLOCK_GROUP_DATA) fs_info->avail_data_alloc_bits &= ~extra_flags; if (flags & BTRFS_BLOCK_GROUP_METADATA) fs_info->avail_metadata_alloc_bits &= ~extra_flags; if (flags & BTRFS_BLOCK_GROUP_SYSTEM) fs_info->avail_system_alloc_bits &= ~extra_flags; + write_sequnlock(&fs_info->profiles_lock); } int btrfs_remove_block_group(struct btrfs_trans_handle *trans, @@ -8038,6 +8288,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_lock(&root->fs_info->block_group_cache_lock); rb_erase(&block_group->cache_node, &root->fs_info->block_group_cache_tree); + + if (root->fs_info->first_logical_byte == block_group->key.objectid) + root->fs_info->first_logical_byte = (u64)-1; spin_unlock(&root->fs_info->block_group_cache_lock); down_write(&block_group->space_info->groups_sem); @@ -8160,7 +8413,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) if (end - start >= range->minlen) { if (!block_group_cache_done(cache)) { - ret = cache_block_group(cache, NULL, root, 0); + ret = cache_block_group(cache, 0); if (!ret) wait_block_group_cache_done(cache); } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1b319df29eee..cdee391fc7bf 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4,7 +4,6 @@ #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/page-flags.h> -#include <linux/module.h> #include <linux/spinlock.h> #include <linux/blkdev.h> #include <linux/swap.h> @@ -1258,6 +1257,39 @@ int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) GFP_NOFS); } +int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ + clear_page_dirty_for_io(page); + page_cache_release(page); + index++; + } + return 0; +} + +int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ + account_page_redirty(page); + __set_page_dirty_nobuffers(page); + page_cache_release(page); + index++; + } + return 0; +} + /* * helper function to set both pages and extents in the tree writeback */ @@ -1834,7 +1866,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, */ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) { - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) SetPageUptodate(page); @@ -1846,7 +1878,7 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) */ static void check_page_locked(struct extent_io_tree *tree, struct page *page) { - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) unlock_page(page); @@ -1895,13 +1927,11 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec, if (ret) err = ret; - if (did_repair) { - ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, - rec->start + rec->len - 1, - EXTENT_DAMAGED, GFP_NOFS); - if (ret && !err) - err = ret; - } + ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, + rec->start + rec->len - 1, + EXTENT_DAMAGED, GFP_NOFS); + if (ret && !err) + err = ret; kfree(rec); return err; @@ -1932,10 +1962,15 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, u64 map_length = 0; u64 sector; struct btrfs_bio *bbio = NULL; + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; int ret; BUG_ON(!mirror_num); + /* we can't repair anything in raid56 yet */ + if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num)) + return 0; + bio = bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; @@ -1960,7 +1995,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, return -EIO; } bio->bi_bdev = dev->bdev; - bio_add_page(bio, page, length, start-page_offset(page)); + bio_add_page(bio, page, length, start - page_offset(page)); btrfsic_submit_bio(WRITE_SYNC, bio); wait_for_completion(&compl); @@ -2052,6 +2087,7 @@ static int clean_io_failure(u64 start, struct page *page) failrec->failed_mirror); did_repair = !ret; } + ret = 0; } out: @@ -2293,8 +2329,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err) struct page *page = bvec->bv_page; tree = &BTRFS_I(page->mapping->host)->io_tree; - start = ((u64)page->index << PAGE_CACHE_SHIFT) + - bvec->bv_offset; + start = page_offset(page) + bvec->bv_offset; end = start + bvec->bv_len - 1; if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) @@ -2353,8 +2388,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) (long int)bio->bi_bdev); tree = &BTRFS_I(page->mapping->host)->io_tree; - start = ((u64)page->index << PAGE_CACHE_SHIFT) + - bvec->bv_offset; + start = page_offset(page) + bvec->bv_offset; end = start + bvec->bv_len - 1; if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) @@ -2471,7 +2505,7 @@ static int __must_check submit_one_bio(int rw, struct bio *bio, struct extent_io_tree *tree = bio->bi_private; u64 start; - start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; + start = page_offset(page) + bvec->bv_offset; bio->bi_private = NULL; @@ -2489,13 +2523,13 @@ static int __must_check submit_one_bio(int rw, struct bio *bio, return ret; } -static int merge_bio(struct extent_io_tree *tree, struct page *page, +static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags) { int ret = 0; if (tree->ops && tree->ops->merge_bio_hook) - ret = tree->ops->merge_bio_hook(page, offset, size, bio, + ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio, bio_flags); BUG_ON(ret < 0); return ret; @@ -2530,7 +2564,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, sector; if (prev_bio_flags != bio_flags || !contig || - merge_bio(tree, page, offset, page_size, bio, bio_flags) || + merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) || bio_add_page(bio, page, page_size, offset) < page_size) { ret = submit_one_bio(rw, bio, mirror_num, prev_bio_flags); @@ -2595,7 +2629,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, unsigned long *bio_flags) { struct inode *inode = page->mapping->host; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 page_end = start + PAGE_CACHE_SIZE - 1; u64 end; u64 cur = start; @@ -2648,6 +2682,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, } } while (cur <= end) { + unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; + if (cur >= last_byte) { char *userpage; struct extent_state *cached = NULL; @@ -2682,7 +2718,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, iosize = min(extent_map_end(em) - cur, end - cur + 1); cur_end = min(extent_map_end(em) - 1, end); - iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + iosize = ALIGN(iosize, blocksize); if (this_bio_flag & EXTENT_BIO_COMPRESSED) { disk_io_size = em->block_len; sector = em->block_start >> 9; @@ -2735,26 +2771,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree, continue; } - ret = 0; - if (tree->ops && tree->ops->readpage_io_hook) { - ret = tree->ops->readpage_io_hook(page, cur, - cur + iosize - 1); - } - if (!ret) { - unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; - pnr -= page->index; - ret = submit_extent_page(READ, tree, page, + pnr -= page->index; + ret = submit_extent_page(READ, tree, page, sector, disk_io_size, pg_offset, bdev, bio, pnr, end_bio_extent_readpage, mirror_num, *bio_flags, this_bio_flag); - if (!ret) { - nr++; - *bio_flags = this_bio_flag; - } - } - if (ret) { + if (!ret) { + nr++; + *bio_flags = this_bio_flag; + } else { SetPageError(page); unlock_extent(tree, cur, cur + iosize - 1); } @@ -2806,7 +2833,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, struct inode *inode = page->mapping->host; struct extent_page_data *epd = data; struct extent_io_tree *tree = epd->tree; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 delalloc_start; u64 page_end = start + PAGE_CACHE_SIZE - 1; u64 end; @@ -2982,7 +3009,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, BUG_ON(extent_map_end(em) <= cur); BUG_ON(end < cur); iosize = min(extent_map_end(em) - cur, end - cur + 1); - iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + iosize = ALIGN(iosize, blocksize); sector = (em->block_start + extent_offset) >> 9; bdev = em->bdev; block_start = em->block_start; @@ -3124,12 +3151,9 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb, set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); spin_unlock(&eb->refs_lock); btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); - spin_lock(&fs_info->delalloc_lock); - if (fs_info->dirty_metadata_bytes >= eb->len) - fs_info->dirty_metadata_bytes -= eb->len; - else - WARN_ON(1); - spin_unlock(&fs_info->delalloc_lock); + __percpu_counter_add(&fs_info->dirty_metadata_bytes, + -eb->len, + fs_info->dirty_metadata_batch); ret = 1; } else { spin_unlock(&eb->refs_lock); @@ -3446,15 +3470,9 @@ retry: * swizzled back from swapper_space to tmpfs file * mapping */ - if (tree->ops && - tree->ops->write_cache_pages_lock_hook) { - tree->ops->write_cache_pages_lock_hook(page, - data, flush_fn); - } else { - if (!trylock_page(page)) { - flush_fn(data); - lock_page(page); - } + if (!trylock_page(page)) { + flush_fn(data); + lock_page(page); } if (unlikely(page->mapping != mapping)) { @@ -3674,11 +3692,11 @@ int extent_invalidatepage(struct extent_io_tree *tree, struct page *page, unsigned long offset) { struct extent_state *cached_state = NULL; - u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); + u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; size_t blocksize = page->mapping->host->i_sb->s_blocksize; - start += (offset + blocksize - 1) & ~(blocksize - 1); + start += ALIGN(offset, blocksize); if (start > end) return 0; @@ -3700,7 +3718,7 @@ int try_release_extent_state(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask) { - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; int ret = 1; @@ -3739,7 +3757,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, gfp_t mask) { struct extent_map *em; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; if ((mask & __GFP_WAIT) && @@ -3797,7 +3815,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, len = last - offset; if (len == 0) break; - len = (len + sectorsize - 1) & ~(sectorsize - 1); + len = ALIGN(len, sectorsize); em = get_extent(inode, NULL, 0, offset, len, 0); if (IS_ERR_OR_NULL(em)) return em; @@ -3995,8 +4013,6 @@ static void __free_extent_buffer(struct extent_buffer *eb) list_del(&eb->leak_list); spin_unlock_irqrestore(&leak_lock, flags); #endif - if (eb->pages && eb->pages != eb->inline_pages) - kfree(eb->pages); kmem_cache_free(extent_buffer_cache, eb); } @@ -4037,19 +4053,12 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, atomic_set(&eb->refs, 1); atomic_set(&eb->io_pages, 0); - if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) { - struct page **pages; - int num_pages = (len + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; - pages = kzalloc(num_pages, mask); - if (!pages) { - __free_extent_buffer(eb); - return NULL; - } - eb->pages = pages; - } else { - eb->pages = eb->inline_pages; - } + /* + * Sanity checks, currently the maximum is 64k covered by 16x 4k pages + */ + BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE + > MAX_INLINE_EXTENT_BUFFER_SIZE); + BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); return eb; } @@ -4180,6 +4189,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) static void check_buffer_tree_ref(struct extent_buffer *eb) { + int refs; /* the ref bit is tricky. We have to make sure it is set * if we have the buffer dirty. Otherwise the * code to free a buffer can end up dropping a dirty @@ -4200,6 +4210,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) * So bump the ref count first, then set the bit. If someone * beat us to it, drop the ref we added. */ + refs = atomic_read(&eb->refs); + if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + return; + spin_lock(&eb->refs_lock); if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) atomic_inc(&eb->refs); @@ -4401,9 +4415,20 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask) void free_extent_buffer(struct extent_buffer *eb) { + int refs; + int old; if (!eb) return; + while (1) { + refs = atomic_read(&eb->refs); + if (refs <= 3) + break; + old = atomic_cmpxchg(&eb->refs, refs, refs - 1); + if (old == refs) + return; + } + spin_lock(&eb->refs_lock); if (atomic_read(&eb->refs) == 2 && test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 2eacfabd3263..258c92156857 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -72,10 +72,9 @@ struct extent_io_ops { int (*writepage_start_hook)(struct page *page, u64 start, u64 end); int (*writepage_io_hook)(struct page *page, u64 start, u64 end); extent_submit_bio_hook_t *submit_bio_hook; - int (*merge_bio_hook)(struct page *page, unsigned long offset, + int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); - int (*readpage_io_hook)(struct page *page, u64 start, u64 end); int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int mirror); @@ -90,8 +89,6 @@ struct extent_io_ops { struct extent_state *other); void (*split_extent_hook)(struct inode *inode, struct extent_state *orig, u64 split); - int (*write_cache_pages_lock_hook)(struct page *page, void *data, - void (*flush_fn)(void *)); }; struct extent_io_tree { @@ -161,8 +158,7 @@ struct extent_buffer { */ wait_queue_head_t read_lock_wq; wait_queue_head_t lock_wq; - struct page *inline_pages[INLINE_EXTENT_BUFFER_PAGES]; - struct page **pages; + struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; }; static inline void extent_set_compress_type(unsigned long *bio_flags, @@ -329,6 +325,8 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, unsigned long *map_len); int extent_range_uptodate(struct extent_io_tree *tree, u64 start, u64 end); +int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); +int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); int extent_clear_unlock_delalloc(struct inode *inode, struct extent_io_tree *tree, u64 start, u64 end, struct page *locked_page, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 2e8cae63d247..2834ca5768ea 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1,6 +1,5 @@ #include <linux/err.h> #include <linux/slab.h> -#include <linux/module.h> #include <linux/spinlock.h> #include <linux/hardirq.h> #include "ctree.h" @@ -288,7 +287,8 @@ out: void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) { clear_bit(EXTENT_FLAG_LOGGING, &em->flags); - try_merge_map(tree, em); + if (em->in_tree) + try_merge_map(tree, em); } /** diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 94aa53b38721..c4628a201cb3 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -118,9 +118,11 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); csums_in_item /= csum_size; - if (csum_offset >= csums_in_item) { + if (csum_offset == csums_in_item) { ret = -EFBIG; goto fail; + } else if (csum_offset > csums_in_item) { + goto fail; } } item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); @@ -684,6 +686,24 @@ out: return ret; } +static u64 btrfs_sector_sum_left(struct btrfs_ordered_sum *sums, + struct btrfs_sector_sum *sector_sum, + u64 total_bytes, u64 sectorsize) +{ + u64 tmp = sectorsize; + u64 next_sector = sector_sum->bytenr; + struct btrfs_sector_sum *next = sector_sum + 1; + + while ((tmp + total_bytes) < sums->len) { + if (next_sector + sectorsize != next->bytenr) + break; + tmp += sectorsize; + next_sector = next->bytenr; + next++; + } + return tmp; +} + int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums) @@ -710,7 +730,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, return -ENOMEM; sector_sum = sums->sums; - trans->adding_csums = 1; again: next_offset = (u64)-1; found_next = 0; @@ -789,20 +808,32 @@ again: goto insert; } - if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) / + if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) / csum_size) { - u32 diff = (csum_offset + 1) * csum_size; + int extend_nr; + u64 tmp; + u32 diff; + u32 free_space; - /* - * is the item big enough already? we dropped our lock - * before and need to recheck - */ - if (diff < btrfs_item_size_nr(leaf, path->slots[0])) - goto csum; + if (btrfs_leaf_free_space(root, leaf) < + sizeof(struct btrfs_item) + csum_size * 2) + goto insert; + + free_space = btrfs_leaf_free_space(root, leaf) - + sizeof(struct btrfs_item) - csum_size; + tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, + root->sectorsize); + tmp >>= root->fs_info->sb->s_blocksize_bits; + WARN_ON(tmp < 1); + + extend_nr = max_t(int, 1, (int)tmp); + diff = (csum_offset + extend_nr) * csum_size; + diff = min(diff, MAX_CSUM_ITEMS(root, csum_size) * csum_size); diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); - if (diff != csum_size) - goto insert; + diff = min(free_space, diff); + diff /= csum_size; + diff *= csum_size; btrfs_extend_item(trans, root, path, diff); goto csum; @@ -812,19 +843,14 @@ insert: btrfs_release_path(path); csum_offset = 0; if (found_next) { - u64 tmp = total_bytes + root->sectorsize; - u64 next_sector = sector_sum->bytenr; - struct btrfs_sector_sum *next = sector_sum + 1; + u64 tmp; - while (tmp < sums->len) { - if (next_sector + root->sectorsize != next->bytenr) - break; - tmp += root->sectorsize; - next_sector = next->bytenr; - next++; - } - tmp = min(tmp, next_offset - file_key.offset); + tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, + root->sectorsize); tmp >>= root->fs_info->sb->s_blocksize_bits; + tmp = min(tmp, (next_offset - file_key.offset) >> + root->fs_info->sb->s_blocksize_bits); + tmp = max((u64)1, tmp); tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size)); ins_size = csum_size * tmp; @@ -874,7 +900,6 @@ next_sector: goto again; } out: - trans->adding_csums = 0; btrfs_free_path(path); return ret; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f76b1fd160d4..ade03e6f7bd2 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -30,11 +30,11 @@ #include <linux/statfs.h> #include <linux/compat.h> #include <linux/slab.h> +#include <linux/btrfs.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "ioctl.h" #include "print-tree.h" #include "tree-log.h" #include "locking.h" @@ -293,15 +293,24 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, struct btrfs_key key; struct btrfs_ioctl_defrag_range_args range; int num_defrag; + int index; + int ret; /* get the inode */ key.objectid = defrag->root; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + inode_root = btrfs_read_fs_root_no_name(fs_info, &key); if (IS_ERR(inode_root)) { - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - return PTR_ERR(inode_root); + ret = PTR_ERR(inode_root); + goto cleanup; + } + if (btrfs_root_refs(&inode_root->root_item) == 0) { + ret = -ENOENT; + goto cleanup; } key.objectid = defrag->ino; @@ -309,9 +318,10 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); if (IS_ERR(inode)) { - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - return PTR_ERR(inode); + ret = PTR_ERR(inode); + goto cleanup; } + srcu_read_unlock(&fs_info->subvol_srcu, index); /* do a chunk of defrag */ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); @@ -346,6 +356,10 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, iput(inode); return 0; +cleanup: + srcu_read_unlock(&fs_info->subvol_srcu, index); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + return ret; } /* @@ -360,6 +374,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) atomic_inc(&fs_info->defrag_running); while(1) { + /* Pause the auto defragger. */ + if (test_bit(BTRFS_FS_STATE_REMOUNTING, + &fs_info->fs_state)) + break; + if (!__need_auto_defrag(fs_info->tree_root)) break; @@ -491,8 +510,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, loff_t isize = i_size_read(inode); start_pos = pos & ~((u64)root->sectorsize - 1); - num_bytes = (write_bytes + pos - start_pos + - root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize); end_of_last_block = start_pos + num_bytes - 1; err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, @@ -573,6 +591,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, } compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); clear_bit(EXTENT_FLAG_PINNED, &em->flags); + clear_bit(EXTENT_FLAG_LOGGING, &flags); remove_extent_mapping(em_tree, em); if (no_splits) goto next; @@ -1211,7 +1230,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, struct extent_state *cached_state = NULL; int i; unsigned long index = pos >> PAGE_CACHE_SHIFT; - struct inode *inode = fdentry(file)->d_inode; + struct inode *inode = file_inode(file); gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int err = 0; int faili = 0; @@ -1298,7 +1317,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, struct iov_iter *i, loff_t pos) { - struct inode *inode = fdentry(file)->d_inode; + struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; unsigned long first_index; @@ -1486,7 +1505,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; - struct inode *inode = fdentry(file)->d_inode; + struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; loff_t *ppos = &iocb->ki_pos; u64 start_pos; @@ -1530,7 +1549,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, * although we have opened a file as writable, we have * to stop this write operation to ensure FS consistency. */ - if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { mutex_unlock(&inode->i_mutex); err = -EROFS; goto out; @@ -1594,9 +1613,10 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, if (err < 0 && num_written > 0) num_written = err; } -out: + if (sync) atomic_dec(&BTRFS_I(inode)->sync_writers); +out: sb_end_write(inode->i_sb); current->backing_dev_info = NULL; return num_written ? num_written : err; @@ -1612,7 +1632,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp) */ if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, &BTRFS_I(inode)->runtime_flags)) { - btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + + /* + * We need to block on a committing transaction to keep us from + * throwing a ordered operation on to the list and causing + * something like sync to deadlock trying to flush out this + * inode. + */ + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode); + btrfs_end_transaction(trans, root); if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) filemap_flush(inode->i_mapping); } @@ -1639,16 +1672,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; struct btrfs_trans_handle *trans; + bool full_sync = 0; trace_btrfs_sync_file(file, datasync); /* * We write the dirty pages in the range and wait until they complete * out of the ->i_mutex. If so, we can flush the dirty pages by - * multi-task, and make the performance up. + * multi-task, and make the performance up. See + * btrfs_wait_ordered_range for an explanation of the ASYNC check. */ atomic_inc(&BTRFS_I(inode)->sync_writers); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + ret = filemap_fdatawrite_range(inode->i_mapping, start, end); atomic_dec(&BTRFS_I(inode)->sync_writers); if (ret) return ret; @@ -1660,7 +1698,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * range being left. */ atomic_inc(&root->log_batch); - btrfs_wait_ordered_range(inode, start, end - start + 1); + full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + if (full_sync) + btrfs_wait_ordered_range(inode, start, end - start + 1); atomic_inc(&root->log_batch); /* @@ -1727,13 +1768,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret != BTRFS_NO_LOG_SYNC) { if (ret > 0) { + /* + * If we didn't already wait for ordered extents we need + * to do that now. + */ + if (!full_sync) + btrfs_wait_ordered_range(inode, start, + end - start + 1); ret = btrfs_commit_transaction(trans, root); } else { ret = btrfs_sync_log(trans, root); - if (ret == 0) + if (ret == 0) { ret = btrfs_end_transaction(trans, root); - else + } else { + if (!full_sync) + btrfs_wait_ordered_range(inode, start, + end - + start + 1); ret = btrfs_commit_transaction(trans, root); + } } } else { ret = btrfs_end_transaction(trans, root); @@ -2087,8 +2140,9 @@ out: static long btrfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct extent_state *cached_state = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; u64 cur_offset; u64 last_byte; u64 alloc_start; @@ -2116,6 +2170,11 @@ static long btrfs_fallocate(struct file *file, int mode, ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); if (ret) return ret; + if (root->fs_info->quota_enabled) { + ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start); + if (ret) + goto out_reserve_fail; + } /* * wait for ordered IO before we have any locks. We'll loop again @@ -2219,6 +2278,9 @@ static long btrfs_fallocate(struct file *file, int mode, &cached_state, GFP_NOFS); out: mutex_unlock(&inode->i_mutex); + if (root->fs_info->quota_enabled) + btrfs_qgroup_free(root, alloc_end - alloc_start); +out_reserve_fail: /* Let go of our reservation. */ btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); return ret; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 0be7a8742a43..1f84fc09c1a8 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1356,6 +1356,8 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); + max_bitmaps = max(max_bitmaps, 1); + BUG_ON(ctl->total_bitmaps > max_bitmaps); /* @@ -1463,10 +1465,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, } static struct btrfs_free_space * -find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes) +find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, + unsigned long align) { struct btrfs_free_space *entry; struct rb_node *node; + u64 ctl_off; + u64 tmp; + u64 align_off; int ret; if (!ctl->free_space_offset.rb_node) @@ -1481,15 +1487,34 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes) if (entry->bytes < *bytes) continue; + /* make sure the space returned is big enough + * to match our requested alignment + */ + if (*bytes >= align) { + ctl_off = entry->offset - ctl->start; + tmp = ctl_off + align - 1;; + do_div(tmp, align); + tmp = tmp * align + ctl->start; + align_off = tmp - entry->offset; + } else { + align_off = 0; + tmp = entry->offset; + } + + if (entry->bytes < *bytes + align_off) + continue; + if (entry->bitmap) { - ret = search_bitmap(ctl, entry, offset, bytes); - if (!ret) + ret = search_bitmap(ctl, entry, &tmp, bytes); + if (!ret) { + *offset = tmp; return entry; + } continue; } - *offset = entry->offset; - *bytes = entry->bytes; + *offset = tmp; + *bytes = entry->bytes - align_off; return entry; } @@ -1636,10 +1661,14 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, } /* - * some block groups are so tiny they can't be enveloped by a bitmap, so - * don't even bother to create a bitmap for this + * The original block groups from mkfs can be really small, like 8 + * megabytes, so don't bother with a bitmap for those entries. However + * some block groups can be smaller than what a bitmap would cover but + * are still large enough that they could overflow the 32k memory limit, + * so allow those block groups to still be allowed to have a bitmap + * entry. */ - if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset) + if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset) return false; return true; @@ -2095,9 +2124,12 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *entry = NULL; u64 bytes_search = bytes + empty_size; u64 ret = 0; + u64 align_gap = 0; + u64 align_gap_len = 0; spin_lock(&ctl->tree_lock); - entry = find_free_space(ctl, &offset, &bytes_search); + entry = find_free_space(ctl, &offset, &bytes_search, + block_group->full_stripe_len); if (!entry) goto out; @@ -2107,9 +2139,15 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, if (!entry->bytes) free_bitmap(ctl, entry); } else { + unlink_free_space(ctl, entry); - entry->offset += bytes; - entry->bytes -= bytes; + align_gap_len = offset - entry->offset; + align_gap = entry->offset; + + entry->offset = offset + bytes; + WARN_ON(entry->bytes < bytes + align_gap_len); + + entry->bytes -= bytes + align_gap_len; if (!entry->bytes) kmem_cache_free(btrfs_free_space_cachep, entry); else @@ -2119,6 +2157,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, out: spin_unlock(&ctl->tree_lock); + if (align_gap_len) + __btrfs_add_free_space(ctl, align_gap, align_gap_len); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cc93b23ca352..09c58a35b429 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -39,12 +39,13 @@ #include <linux/slab.h> #include <linux/ratelimit.h> #include <linux/mount.h> +#include <linux/btrfs.h> +#include <linux/blkdev.h> #include "compat.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "ioctl.h" #include "print-tree.h" #include "ordered-data.h" #include "xattr.h" @@ -54,6 +55,7 @@ #include "locking.h" #include "free-space-cache.h" #include "inode-map.h" +#include "backref.h" struct btrfs_iget_args { u64 ino; @@ -231,8 +233,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, u64 isize = i_size_read(inode); u64 actual_end = min(end + 1, isize); u64 inline_len = actual_end - start; - u64 aligned_end = (end + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); + u64 aligned_end = ALIGN(end, root->sectorsize); u64 data_len = inline_len; int ret; @@ -265,6 +266,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, return 1; } + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); return 0; @@ -351,6 +353,7 @@ static noinline int compress_file_range(struct inode *inode, int i; int will_compress; int compress_type = root->fs_info->compress_type; + int redirty = 0; /* if this is a small write inside eof, kick off a defrag */ if ((end - start + 1) < 16 * 1024 && @@ -389,7 +392,7 @@ again: * a compressed extent to 128k. */ total_compressed = min(total_compressed, max_uncompressed); - num_bytes = (end - start + blocksize) & ~(blocksize - 1); + num_bytes = ALIGN(end - start + 1, blocksize); num_bytes = max(blocksize, num_bytes); total_in = 0; ret = 0; @@ -413,6 +416,17 @@ again: if (BTRFS_I(inode)->force_compress) compress_type = BTRFS_I(inode)->force_compress; + /* + * we need to call clear_page_dirty_for_io on each + * page in the range. Otherwise applications with the file + * mmap'd can wander in and change the page contents while + * we are compressing them. + * + * If the compression fails for any reason, we set the pages + * dirty again later on. + */ + extent_range_clear_dirty_for_io(inode, start, end); + redirty = 1; ret = btrfs_compress_pages(compress_type, inode->i_mapping, start, total_compressed, pages, @@ -488,15 +502,13 @@ cont: * up to a block size boundary so the allocator does sane * things */ - total_compressed = (total_compressed + blocksize - 1) & - ~(blocksize - 1); + total_compressed = ALIGN(total_compressed, blocksize); /* * one last check to make sure the compression is really a * win, compare the page count read with the blocks on disk */ - total_in = (total_in + PAGE_CACHE_SIZE - 1) & - ~(PAGE_CACHE_SIZE - 1); + total_in = ALIGN(total_in, PAGE_CACHE_SIZE); if (total_compressed >= total_in) { will_compress = 0; } else { @@ -554,6 +566,8 @@ cleanup_and_bail_uncompressed: __set_page_dirty_nobuffers(locked_page); /* unlocked later on in the async handlers */ } + if (redirty) + extent_range_redirty_for_io(inode, start, end); add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0, BTRFS_COMPRESS_NONE); *num_added += 1; @@ -608,7 +622,7 @@ static noinline int submit_compressed_extents(struct inode *inode, if (list_empty(&async_cow->extents)) return 0; - +again: while (!list_empty(&async_cow->extents)) { async_extent = list_entry(async_cow->extents.next, struct async_extent, list); @@ -648,6 +662,8 @@ retry: async_extent->ram_size - 1, btrfs_get_extent, WB_SYNC_ALL); + else if (ret) + unlock_page(async_cow->locked_page); kfree(async_extent); cond_resched(); continue; @@ -672,6 +688,7 @@ retry: if (ret) { int i; + for (i = 0; i < async_extent->nr_pages; i++) { WARN_ON(async_extent->pages[i]->mapping); page_cache_release(async_extent->pages[i]); @@ -679,12 +696,10 @@ retry: kfree(async_extent->pages); async_extent->nr_pages = 0; async_extent->pages = NULL; - unlock_extent(io_tree, async_extent->start, - async_extent->start + - async_extent->ram_size - 1); + if (ret == -ENOSPC) goto retry; - goto out_free; /* JDM: Requeue? */ + goto out_free; } /* @@ -696,10 +711,13 @@ retry: async_extent->ram_size - 1, 0); em = alloc_extent_map(); - BUG_ON(!em); /* -ENOMEM */ + if (!em) + goto out_free_reserve; em->start = async_extent->start; em->len = async_extent->ram_size; em->orig_start = em->start; + em->mod_start = em->start; + em->mod_len = em->len; em->block_start = ins.objectid; em->block_len = ins.offset; @@ -726,6 +744,9 @@ retry: async_extent->ram_size - 1, 0); } + if (ret) + goto out_free_reserve; + ret = btrfs_add_ordered_extent_compress(inode, async_extent->start, ins.objectid, @@ -733,7 +754,8 @@ retry: ins.offset, BTRFS_ORDERED_COMPRESSED, async_extent->compress_type); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + goto out_free_reserve; /* * clear dirty, set writeback and unlock the pages. @@ -754,18 +776,30 @@ retry: ins.objectid, ins.offset, async_extent->pages, async_extent->nr_pages); - - BUG_ON(ret); /* -ENOMEM */ alloc_hint = ins.objectid + ins.offset; kfree(async_extent); + if (ret) + goto out; cond_resched(); } ret = 0; out: return ret; +out_free_reserve: + btrfs_free_reserved_extent(root, ins.objectid, ins.offset); out_free: + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + NULL, EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); kfree(async_extent); - goto out; + goto again; } static u64 get_extent_allocation_hint(struct inode *inode, u64 start, @@ -834,7 +868,7 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, BUG_ON(btrfs_is_free_space_inode(inode)); - num_bytes = (end - start + blocksize) & ~(blocksize - 1); + num_bytes = ALIGN(end - start + 1, blocksize); num_bytes = max(blocksize, num_bytes); disk_num_bytes = num_bytes; @@ -892,6 +926,8 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, em->orig_start = em->start; ram_size = ins.offset; em->len = ins.offset; + em->mod_start = em->start; + em->mod_len = em->len; em->block_start = ins.objectid; em->block_len = ins.offset; @@ -1338,6 +1374,8 @@ out_check: em->block_start = disk_bytenr; em->orig_block_len = disk_num_bytes; em->bdev = root->fs_info->fs_devices->latest_bdev; + em->mod_start = em->start; + em->mod_len = em->len; set_bit(EXTENT_FLAG_PINNED, &em->flags); set_bit(EXTENT_FLAG_FILLING, &em->flags); em->generation = -1; @@ -1508,14 +1546,22 @@ static void btrfs_set_bit_hook(struct inode *inode, spin_unlock(&BTRFS_I(inode)->lock); } - spin_lock(&root->fs_info->delalloc_lock); + __percpu_counter_add(&root->fs_info->delalloc_bytes, len, + root->fs_info->delalloc_batch); + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->delalloc_bytes += len; - root->fs_info->delalloc_bytes += len; - if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) { - list_add_tail(&BTRFS_I(inode)->delalloc_inodes, - &root->fs_info->delalloc_inodes); + if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags)) { + spin_lock(&root->fs_info->delalloc_lock); + if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_add_tail(&BTRFS_I(inode)->delalloc_inodes, + &root->fs_info->delalloc_inodes); + set_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags); + } + spin_unlock(&root->fs_info->delalloc_lock); } - spin_unlock(&root->fs_info->delalloc_lock); + spin_unlock(&BTRFS_I(inode)->lock); } } @@ -1550,15 +1596,22 @@ static void btrfs_clear_bit_hook(struct inode *inode, && do_list) btrfs_free_reserved_data_space(inode, len); - spin_lock(&root->fs_info->delalloc_lock); - root->fs_info->delalloc_bytes -= len; + __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, + root->fs_info->delalloc_batch); + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->delalloc_bytes -= len; - if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && - !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { - list_del_init(&BTRFS_I(inode)->delalloc_inodes); + test_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags)) { + spin_lock(&root->fs_info->delalloc_lock); + if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_del_init(&BTRFS_I(inode)->delalloc_inodes); + clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags); + } + spin_unlock(&root->fs_info->delalloc_lock); } - spin_unlock(&root->fs_info->delalloc_lock); + spin_unlock(&BTRFS_I(inode)->lock); } } @@ -1566,7 +1619,7 @@ static void btrfs_clear_bit_hook(struct inode *inode, * extent_io.c merge_bio_hook, this must check the chunk tree to make sure * we don't create bios that span stripes or chunks */ -int btrfs_merge_bio_hook(struct page *page, unsigned long offset, +int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags) { @@ -1581,7 +1634,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, length = bio->bi_size; map_length = length; - ret = btrfs_map_block(root->fs_info, READ, logical, + ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, NULL, 0); /* Will always return 0 with map_multi == NULL */ BUG_ON(ret < 0); @@ -1704,8 +1757,10 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum *sum; list_for_each_entry(sum, list, list) { + trans->adding_csums = 1; btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root->fs_info->csum_root, sum); + trans->adding_csums = 0; } return 0; } @@ -1892,6 +1947,643 @@ out: return ret; } +/* snapshot-aware defrag */ +struct sa_defrag_extent_backref { + struct rb_node node; + struct old_sa_defrag_extent *old; + u64 root_id; + u64 inum; + u64 file_pos; + u64 extent_offset; + u64 num_bytes; + u64 generation; +}; + +struct old_sa_defrag_extent { + struct list_head list; + struct new_sa_defrag_extent *new; + + u64 extent_offset; + u64 bytenr; + u64 offset; + u64 len; + int count; +}; + +struct new_sa_defrag_extent { + struct rb_root root; + struct list_head head; + struct btrfs_path *path; + struct inode *inode; + u64 file_pos; + u64 len; + u64 bytenr; + u64 disk_len; + u8 compress_type; +}; + +static int backref_comp(struct sa_defrag_extent_backref *b1, + struct sa_defrag_extent_backref *b2) +{ + if (b1->root_id < b2->root_id) + return -1; + else if (b1->root_id > b2->root_id) + return 1; + + if (b1->inum < b2->inum) + return -1; + else if (b1->inum > b2->inum) + return 1; + + if (b1->file_pos < b2->file_pos) + return -1; + else if (b1->file_pos > b2->file_pos) + return 1; + + /* + * [------------------------------] ===> (a range of space) + * |<--->| |<---->| =============> (fs/file tree A) + * |<---------------------------->| ===> (fs/file tree B) + * + * A range of space can refer to two file extents in one tree while + * refer to only one file extent in another tree. + * + * So we may process a disk offset more than one time(two extents in A) + * and locate at the same extent(one extent in B), then insert two same + * backrefs(both refer to the extent in B). + */ + return 0; +} + +static void backref_insert(struct rb_root *root, + struct sa_defrag_extent_backref *backref) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct sa_defrag_extent_backref *entry; + int ret; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct sa_defrag_extent_backref, node); + + ret = backref_comp(backref, entry); + if (ret < 0) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + rb_link_node(&backref->node, parent, p); + rb_insert_color(&backref->node, root); +} + +/* + * Note the backref might has changed, and in this case we just return 0. + */ +static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, + void *ctx) +{ + struct btrfs_file_extent_item *extent; + struct btrfs_fs_info *fs_info; + struct old_sa_defrag_extent *old = ctx; + struct new_sa_defrag_extent *new = old->new; + struct btrfs_path *path = new->path; + struct btrfs_key key; + struct btrfs_root *root; + struct sa_defrag_extent_backref *backref; + struct extent_buffer *leaf; + struct inode *inode = new->inode; + int slot; + int ret; + u64 extent_offset; + u64 num_bytes; + + if (BTRFS_I(inode)->root->root_key.objectid == root_id && + inum == btrfs_ino(inode)) + return 0; + + key.objectid = root_id; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + fs_info = BTRFS_I(inode)->root->fs_info; + root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(root)) { + if (PTR_ERR(root) == -ENOENT) + return 0; + WARN_ON(1); + pr_debug("inum=%llu, offset=%llu, root_id=%llu\n", + inum, offset, root_id); + return PTR_ERR(root); + } + + key.objectid = inum; + key.type = BTRFS_EXTENT_DATA_KEY; + if (offset > (u64)-1 << 32) + key.offset = 0; + else + key.offset = offset; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + WARN_ON(1); + return ret; + } + + while (1) { + cond_resched(); + + leaf = path->nodes[0]; + slot = path->slots[0]; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + goto out; + } + continue; + } + + path->slots[0]++; + + btrfs_item_key_to_cpu(leaf, &key, slot); + + if (key.objectid > inum) + goto out; + + if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY) + continue; + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + + if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr) + continue; + + extent_offset = btrfs_file_extent_offset(leaf, extent); + if (key.offset - extent_offset != offset) + continue; + + num_bytes = btrfs_file_extent_num_bytes(leaf, extent); + if (extent_offset >= old->extent_offset + old->offset + + old->len || extent_offset + num_bytes <= + old->extent_offset + old->offset) + continue; + + break; + } + + backref = kmalloc(sizeof(*backref), GFP_NOFS); + if (!backref) { + ret = -ENOENT; + goto out; + } + + backref->root_id = root_id; + backref->inum = inum; + backref->file_pos = offset + extent_offset; + backref->num_bytes = num_bytes; + backref->extent_offset = extent_offset; + backref->generation = btrfs_file_extent_generation(leaf, extent); + backref->old = old; + backref_insert(&new->root, backref); + old->count++; +out: + btrfs_release_path(path); + WARN_ON(ret); + return ret; +} + +static noinline bool record_extent_backrefs(struct btrfs_path *path, + struct new_sa_defrag_extent *new) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info; + struct old_sa_defrag_extent *old, *tmp; + int ret; + + new->path = path; + + list_for_each_entry_safe(old, tmp, &new->head, list) { + ret = iterate_inodes_from_logical(old->bytenr, fs_info, + path, record_one_backref, + old); + BUG_ON(ret < 0 && ret != -ENOENT); + + /* no backref to be processed for this extent */ + if (!old->count) { + list_del(&old->list); + kfree(old); + } + } + + if (list_empty(&new->head)) + return false; + + return true; +} + +static int relink_is_mergable(struct extent_buffer *leaf, + struct btrfs_file_extent_item *fi, + u64 disk_bytenr) +{ + if (btrfs_file_extent_disk_bytenr(leaf, fi) != disk_bytenr) + return 0; + + if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) + return 0; + + if (btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + return 0; + + return 1; +} + +/* + * Note the backref might has changed, and in this case we just return 0. + */ +static noinline int relink_extent_backref(struct btrfs_path *path, + struct sa_defrag_extent_backref *prev, + struct sa_defrag_extent_backref *backref) +{ + struct btrfs_file_extent_item *extent; + struct btrfs_file_extent_item *item; + struct btrfs_ordered_extent *ordered; + struct btrfs_trans_handle *trans; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root; + struct btrfs_key key; + struct extent_buffer *leaf; + struct old_sa_defrag_extent *old = backref->old; + struct new_sa_defrag_extent *new = old->new; + struct inode *src_inode = new->inode; + struct inode *inode; + struct extent_state *cached = NULL; + int ret = 0; + u64 start; + u64 len; + u64 lock_start; + u64 lock_end; + bool merge = false; + int index; + + if (prev && prev->root_id == backref->root_id && + prev->inum == backref->inum && + prev->file_pos + prev->num_bytes == backref->file_pos) + merge = true; + + /* step 1: get root */ + key.objectid = backref->root_id; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + fs_info = BTRFS_I(src_inode)->root->fs_info; + index = srcu_read_lock(&fs_info->subvol_srcu); + + root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + if (PTR_ERR(root) == -ENOENT) + return 0; + return PTR_ERR(root); + } + if (btrfs_root_refs(&root->root_item) == 0) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + /* parse ENOENT to 0 */ + return 0; + } + + /* step 2: get inode */ + key.objectid = backref->inum; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + inode = btrfs_iget(fs_info->sb, &key, root, NULL); + if (IS_ERR(inode)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + return 0; + } + + srcu_read_unlock(&fs_info->subvol_srcu, index); + + /* step 3: relink backref */ + lock_start = backref->file_pos; + lock_end = backref->file_pos + backref->num_bytes - 1; + lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end, + 0, &cached); + + ordered = btrfs_lookup_first_ordered_extent(inode, lock_end); + if (ordered) { + btrfs_put_ordered_extent(ordered); + goto out_unlock; + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_unlock; + } + + key.objectid = backref->inum; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = backref->file_pos; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out_free_path; + } else if (ret > 0) { + ret = 0; + goto out_free_path; + } + + extent = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + + if (btrfs_file_extent_generation(path->nodes[0], extent) != + backref->generation) + goto out_free_path; + + btrfs_release_path(path); + + start = backref->file_pos; + if (backref->extent_offset < old->extent_offset + old->offset) + start += old->extent_offset + old->offset - + backref->extent_offset; + + len = min(backref->extent_offset + backref->num_bytes, + old->extent_offset + old->offset + old->len); + len -= max(backref->extent_offset, old->extent_offset + old->offset); + + ret = btrfs_drop_extents(trans, root, inode, start, + start + len, 1); + if (ret) + goto out_free_path; +again: + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + + path->leave_spinning = 1; + if (merge) { + struct btrfs_file_extent_item *fi; + u64 extent_len; + struct btrfs_key found_key; + + ret = btrfs_search_slot(trans, root, &key, path, 1, 1); + if (ret < 0) + goto out_free_path; + + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_len = btrfs_file_extent_num_bytes(leaf, fi); + + if (relink_is_mergable(leaf, fi, new->bytenr) && + extent_len + found_key.offset == start) { + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_len + len); + btrfs_mark_buffer_dirty(leaf); + inode_add_bytes(inode, len); + + ret = 1; + goto out_free_path; + } else { + merge = false; + btrfs_release_path(path); + goto again; + } + } + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*extent)); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_free_path; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len); + btrfs_set_file_extent_offset(leaf, item, start - new->file_pos); + btrfs_set_file_extent_num_bytes(leaf, item, len); + btrfs_set_file_extent_ram_bytes(leaf, item, new->len); + btrfs_set_file_extent_generation(leaf, item, trans->transid); + btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_compression(leaf, item, new->compress_type); + btrfs_set_file_extent_encryption(leaf, item, 0); + btrfs_set_file_extent_other_encoding(leaf, item, 0); + + btrfs_mark_buffer_dirty(leaf); + inode_add_bytes(inode, len); + btrfs_release_path(path); + + ret = btrfs_inc_extent_ref(trans, root, new->bytenr, + new->disk_len, 0, + backref->root_id, backref->inum, + new->file_pos, 0); /* start - extent_offset */ + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_free_path; + } + + ret = 1; +out_free_path: + btrfs_release_path(path); + path->leave_spinning = 0; + btrfs_end_transaction(trans, root); +out_unlock: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, + &cached, GFP_NOFS); + iput(inode); + return ret; +} + +static void relink_file_extents(struct new_sa_defrag_extent *new) +{ + struct btrfs_path *path; + struct old_sa_defrag_extent *old, *tmp; + struct sa_defrag_extent_backref *backref; + struct sa_defrag_extent_backref *prev = NULL; + struct inode *inode; + struct btrfs_root *root; + struct rb_node *node; + int ret; + + inode = new->inode; + root = BTRFS_I(inode)->root; + + path = btrfs_alloc_path(); + if (!path) + return; + + if (!record_extent_backrefs(path, new)) { + btrfs_free_path(path); + goto out; + } + btrfs_release_path(path); + + while (1) { + node = rb_first(&new->root); + if (!node) + break; + rb_erase(node, &new->root); + + backref = rb_entry(node, struct sa_defrag_extent_backref, node); + + ret = relink_extent_backref(path, prev, backref); + WARN_ON(ret < 0); + + kfree(prev); + + if (ret == 1) + prev = backref; + else + prev = NULL; + cond_resched(); + } + kfree(prev); + + btrfs_free_path(path); + + list_for_each_entry_safe(old, tmp, &new->head, list) { + list_del(&old->list); + kfree(old); + } +out: + atomic_dec(&root->fs_info->defrag_running); + wake_up(&root->fs_info->transaction_wait); + + kfree(new); +} + +static struct new_sa_defrag_extent * +record_old_file_extents(struct inode *inode, + struct btrfs_ordered_extent *ordered) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; + struct btrfs_key key; + struct old_sa_defrag_extent *old, *tmp; + struct new_sa_defrag_extent *new; + int ret; + + new = kmalloc(sizeof(*new), GFP_NOFS); + if (!new) + return NULL; + + new->inode = inode; + new->file_pos = ordered->file_offset; + new->len = ordered->len; + new->bytenr = ordered->start; + new->disk_len = ordered->disk_len; + new->compress_type = ordered->compress_type; + new->root = RB_ROOT; + INIT_LIST_HEAD(&new->head); + + path = btrfs_alloc_path(); + if (!path) + goto out_kfree; + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = new->file_pos; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out_free_path; + if (ret > 0 && path->slots[0] > 0) + path->slots[0]--; + + /* find out all the old extents for the file range */ + while (1) { + struct btrfs_file_extent_item *extent; + struct extent_buffer *l; + int slot; + u64 num_bytes; + u64 offset; + u64 end; + u64 disk_bytenr; + u64 extent_offset; + + l = path->nodes[0]; + slot = path->slots[0]; + + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out_free_list; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid != btrfs_ino(inode)) + break; + if (key.type != BTRFS_EXTENT_DATA_KEY) + break; + if (key.offset >= new->file_pos + new->len) + break; + + extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item); + + num_bytes = btrfs_file_extent_num_bytes(l, extent); + if (key.offset + num_bytes < new->file_pos) + goto next; + + disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent); + if (!disk_bytenr) + goto next; + + extent_offset = btrfs_file_extent_offset(l, extent); + + old = kmalloc(sizeof(*old), GFP_NOFS); + if (!old) + goto out_free_list; + + offset = max(new->file_pos, key.offset); + end = min(new->file_pos + new->len, key.offset + num_bytes); + + old->bytenr = disk_bytenr; + old->extent_offset = extent_offset; + old->offset = offset - key.offset; + old->len = end - offset; + old->new = new; + old->count = 0; + list_add_tail(&old->list, &new->head); +next: + path->slots[0]++; + cond_resched(); + } + + btrfs_free_path(path); + atomic_inc(&root->fs_info->defrag_running); + + return new; + +out_free_list: + list_for_each_entry_safe(old, tmp, &new->head, list) { + list_del(&old->list); + kfree(old); + } +out_free_path: + btrfs_free_path(path); +out_kfree: + kfree(new); + return NULL; +} + /* * helper function for btrfs_finish_ordered_io, this * just reads in some of the csum leaves to prime them into ram @@ -1909,6 +2601,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) struct btrfs_trans_handle *trans = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; + struct new_sa_defrag_extent *new = NULL; int compress_type = 0; int ret; bool nolock; @@ -1943,6 +2636,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ordered_extent->file_offset + ordered_extent->len - 1, 0, &cached_state); + ret = test_range_bit(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + EXTENT_DEFRAG, 1, cached_state); + if (ret) { + u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); + if (last_snapshot >= BTRFS_I(inode)->generation) + /* the inode is shared */ + new = record_old_file_extents(inode, ordered_extent); + + clear_extent_bit(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); + } + if (nolock) trans = btrfs_join_transaction_nolock(root); else @@ -2001,17 +2708,33 @@ out: if (trans) btrfs_end_transaction(trans, root); - if (ret) + if (ret) { clear_extent_uptodate(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, NULL, GFP_NOFS); + /* + * If the ordered extent had an IOERR or something else went + * wrong we need to return the space for this ordered extent + * back to the allocator. + */ + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && + !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) + btrfs_free_reserved_extent(root, ordered_extent->start, + ordered_extent->disk_len); + } + + /* * This needs to be done to make sure anybody waiting knows we are done * updating everything for this ordered extent. */ btrfs_remove_ordered_extent(inode, ordered_extent); + /* for snapshot-aware defrag */ + if (new) + relink_file_extents(new); + /* once for us */ btrfs_put_ordered_extent(ordered_extent); /* once for the tree */ @@ -2062,7 +2785,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state, int mirror) { - size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); + size_t offset = start - page_offset(page); struct inode *inode = page->mapping->host; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; char *kaddr; @@ -2167,11 +2890,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) } } -enum btrfs_orphan_cleanup_state { - ORPHAN_CLEANUP_STARTED = 1, - ORPHAN_CLEANUP_DONE = 2, -}; - /* * This is called in transaction commit time. If there are no orphan * files in the subvolume, it removes orphan item and frees block_rsv @@ -2469,6 +3187,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) */ set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &BTRFS_I(inode)->runtime_flags); + atomic_inc(&root->orphan_inodes); /* if we have links, this was a truncate, lets do that */ if (inode->i_nlink) { @@ -2491,6 +3210,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) goto out; ret = btrfs_truncate(inode); + if (ret) + btrfs_orphan_del(NULL, inode); } else { nr_unlink++; } @@ -2709,34 +3430,41 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *item, struct inode *inode) { - btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); - btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); - btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); - btrfs_set_inode_mode(leaf, item, inode->i_mode); - btrfs_set_inode_nlink(leaf, item, inode->i_nlink); + struct btrfs_map_token token; + + btrfs_init_map_token(&token); + + btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); + btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); + btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size, + &token); + btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); + btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); - btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), - inode->i_atime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), - inode->i_atime.tv_nsec); + btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_nsec, &token); - btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), - inode->i_mtime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), - inode->i_mtime.tv_nsec); + btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_nsec, &token); - btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), - inode->i_ctime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), - inode->i_ctime.tv_nsec); + btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_nsec, &token); - btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); - btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); - btrfs_set_inode_sequence(leaf, item, inode->i_version); - btrfs_set_inode_transid(leaf, item, trans->transid); - btrfs_set_inode_rdev(leaf, item, inode->i_rdev); - btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); - btrfs_set_inode_block_group(leaf, item, 0); + btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), + &token); + btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, + &token); + btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); + btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); + btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); + btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); + btrfs_set_token_inode_block_group(leaf, item, 0, &token); } /* @@ -2967,11 +3695,9 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, * 1 for the dir item * 1 for the dir index * 1 for the inode ref - * 1 for the inode ref in the tree log - * 2 for the dir entries in the log * 1 for the inode */ - trans = btrfs_start_transaction(root, 8); + trans = btrfs_start_transaction(root, 5); if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) return trans; @@ -3304,7 +4030,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u64 extent_num_bytes = 0; u64 extent_offset = 0; u64 item_end = 0; - u64 mask = root->sectorsize - 1; u32 found_type = (u8)-1; int found_extent; int del_item; @@ -3328,7 +4053,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, * extent just the way it is. */ if (root->ref_cows || root == root->fs_info->tree_root) - btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0); + btrfs_drop_extent_cache(inode, ALIGN(new_size, + root->sectorsize), (u64)-1, 0); /* * This function is also used to drop the items in the log tree before @@ -3407,10 +4133,9 @@ search_again: if (!del_item) { u64 orig_num_bytes = btrfs_file_extent_num_bytes(leaf, fi); - extent_num_bytes = new_size - - found_key.offset + root->sectorsize - 1; - extent_num_bytes = extent_num_bytes & - ~((u64)root->sectorsize - 1); + extent_num_bytes = ALIGN(new_size - + found_key.offset, + root->sectorsize); btrfs_set_file_extent_num_bytes(leaf, fi, extent_num_bytes); num_dec = (orig_num_bytes - @@ -3646,9 +4371,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) struct extent_map *em = NULL; struct extent_state *cached_state = NULL; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - u64 mask = root->sectorsize - 1; - u64 hole_start = (oldsize + mask) & ~mask; - u64 block_end = (size + mask) & ~mask; + u64 hole_start = ALIGN(oldsize, root->sectorsize); + u64 block_end = ALIGN(size, root->sectorsize); u64 last_byte; u64 cur_offset; u64 hole_size; @@ -3681,7 +4405,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) break; } last_byte = min(extent_map_end(em), block_end); - last_byte = (last_byte + mask) & ~mask; + last_byte = ALIGN(last_byte , root->sectorsize); if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { struct extent_map *hole_em; hole_size = last_byte - cur_offset; @@ -3832,6 +4556,12 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) /* we don't support swapfiles, so vmtruncate shouldn't fail */ truncate_setsize(inode, newsize); + + /* Disable nonlocked read DIO to avoid the end less truncate */ + btrfs_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + btrfs_inode_resume_unlocked_dio(inode); + ret = btrfs_truncate(inode); if (ret && inode->i_nlink) btrfs_orphan_del(NULL, inode); @@ -3904,6 +4634,12 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } + ret = btrfs_commit_inode_delayed_inode(inode); + if (ret) { + btrfs_orphan_del(NULL, inode); + goto no_delete; + } + rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); if (!rsv) { btrfs_orphan_del(NULL, inode); @@ -3941,7 +4677,7 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } - trans = btrfs_start_transaction_lflush(root, 1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { btrfs_orphan_del(NULL, inode); btrfs_free_block_rsv(root, rsv); @@ -3955,9 +4691,6 @@ void btrfs_evict_inode(struct inode *inode) break; trans->block_rsv = &root->fs_info->trans_block_rsv; - ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); - btrfs_end_transaction(trans, root); trans = NULL; btrfs_btree_balance_dirty(root); @@ -4391,7 +5124,7 @@ unsigned char btrfs_filetype_table[] = { static int btrfs_real_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = file_inode(filp); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_item *item; struct btrfs_dir_item *di; @@ -4854,7 +5587,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if (btrfs_test_opt(root, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; if (btrfs_test_opt(root, NODATACOW)) - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM; } insert_inode_hash(inode); @@ -5006,12 +5740,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, goto out_unlock; } - err = btrfs_update_inode(trans, root, inode); - if (err) { - drop_inode = 1; - goto out_unlock; - } - /* * If the active LSM wants to access the inode during * d_instantiate it needs these. Smack checks to see @@ -5396,8 +6124,7 @@ again: } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { size_t size; size = btrfs_file_extent_inline_len(leaf, item); - extent_end = (extent_start + size + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); + extent_end = ALIGN(extent_start + size, root->sectorsize); } if (start >= extent_end) { @@ -5469,8 +6196,7 @@ again: copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, size - extent_offset); em->start = extent_start + extent_offset; - em->len = (copy_size + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); + em->len = ALIGN(copy_size, root->sectorsize); em->orig_block_len = em->len; em->orig_start = em->start; if (compress_type) { @@ -5949,6 +6675,8 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, em->start = start; em->orig_start = orig_start; + em->mod_start = start; + em->mod_len = len; em->len = len; em->block_len = block_len; em->block_start = block_start; @@ -5990,16 +6718,12 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, u64 len = bh_result->b_size; struct btrfs_trans_handle *trans; int unlock_bits = EXTENT_LOCKED; - int ret; + int ret = 0; - if (create) { - ret = btrfs_delalloc_reserve_space(inode, len); - if (ret) - return ret; + if (create) unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; - } else { + else len = min_t(u64, len, root->sectorsize); - } lockstart = start; lockend = start + len - 1; @@ -6011,14 +6735,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create)) return -ENOTBLK; - if (create) { - ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockend, EXTENT_DELALLOC, NULL, - &cached_state, GFP_NOFS); - if (ret) - goto unlock_err; - } - em = btrfs_get_extent(inode, NULL, 0, start, len, 0); if (IS_ERR(em)) { ret = PTR_ERR(em); @@ -6050,7 +6766,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (!create && (em->block_start == EXTENT_MAP_HOLE || test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { free_extent_map(em); - ret = 0; goto unlock_err; } @@ -6148,6 +6863,15 @@ unlock: */ if (start + len > i_size_read(inode)) i_size_write(inode, start + len); + + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); + + ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockstart + len - 1, EXTENT_DELALLOC, NULL, + &cached_state, GFP_NOFS); + BUG_ON(ret); } /* @@ -6156,24 +6880,9 @@ unlock: * aren't using if there is any left over space. */ if (lockstart < lockend) { - if (create && len < lockend - lockstart) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockstart + len - 1, - unlock_bits | EXTENT_DEFRAG, 1, 0, - &cached_state, GFP_NOFS); - /* - * Beside unlock, we also need to cleanup reserved space - * for the left range by attaching EXTENT_DO_ACCOUNTING. - */ - clear_extent_bit(&BTRFS_I(inode)->io_tree, - lockstart + len, lockend, - unlock_bits | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS); - } else { - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockend, unlock_bits, 1, 0, - &cached_state, GFP_NOFS); - } + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, unlock_bits, 1, 0, + &cached_state, GFP_NOFS); } else { free_extent_state(cached_state); } @@ -6183,9 +6892,6 @@ unlock: return 0; unlock_err: - if (create) - unlock_bits |= EXTENT_DO_ACCOUNTING; - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, unlock_bits, 1, 0, &cached_state, GFP_NOFS); return ret; @@ -6426,19 +7132,24 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, int async_submit = 0; map_length = orig_bio->bi_size; - ret = btrfs_map_block(root->fs_info, READ, start_sector << 9, + ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, &map_length, NULL, 0); if (ret) { bio_put(orig_bio); return -EIO; } - if (map_length >= orig_bio->bi_size) { bio = orig_bio; goto submit; } - async_submit = 1; + /* async crcs make it difficult to collect full stripe writes. */ + if (btrfs_get_alloc_profile(root, 1) & + (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) + async_submit = 0; + else + async_submit = 1; + bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); if (!bio) return -ENOMEM; @@ -6480,7 +7191,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, bio->bi_end_io = btrfs_end_dio_bio; map_length = orig_bio->bi_size; - ret = btrfs_map_block(root->fs_info, READ, + ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, &map_length, NULL, 0); if (ret) { @@ -6623,15 +7334,60 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + size_t count = 0; + int flags = 0; + bool wakeup = true; + bool relock = false; + ssize_t ret; if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, offset, nr_segs)) return 0; - return __blockdev_direct_IO(rw, iocb, inode, - BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, - iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, - btrfs_submit_direct, 0); + atomic_inc(&inode->i_dio_count); + smp_mb__after_atomic_inc(); + + if (rw & WRITE) { + count = iov_length(iov, nr_segs); + /* + * If the write DIO is beyond the EOF, we need update + * the isize, but it is protected by i_mutex. So we can + * not unlock the i_mutex at this case. + */ + if (offset + count <= inode->i_size) { + mutex_unlock(&inode->i_mutex); + relock = true; + } + ret = btrfs_delalloc_reserve_space(inode, count); + if (ret) + goto out; + } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, + &BTRFS_I(inode)->runtime_flags))) { + inode_dio_done(inode); + flags = DIO_LOCKING | DIO_SKIP_HOLES; + wakeup = false; + } + + ret = __blockdev_direct_IO(rw, iocb, inode, + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, + iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, + btrfs_submit_direct, flags); + if (rw & WRITE) { + if (ret < 0 && ret != -EIOCBQUEUED) + btrfs_delalloc_release_space(inode, count); + else if (ret >= 0 && (size_t)ret < count) + btrfs_delalloc_release_space(inode, + count - (size_t)ret); + else + btrfs_delalloc_release_metadata(inode, 0); + } +out: + if (wakeup) + inode_dio_done(inode); + if (relock) + mutex_lock(&inode->i_mutex); + + return ret; } #define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) @@ -6735,8 +7491,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) return; } lock_extent_bits(tree, page_start, page_end, 0, &cached_state); - ordered = btrfs_lookup_ordered_extent(inode, - page_offset(page)); + ordered = btrfs_lookup_ordered_extent(inode, page_offset(page)); if (ordered) { /* * IO on this page will never be started, so we need @@ -6791,7 +7546,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = fdentry(vma->vm_file)->d_inode; + struct inode *inode = file_inode(vma->vm_file); struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; @@ -7216,8 +7971,9 @@ int btrfs_drop_inode(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; + /* the snap/subvol tree is on deleting */ if (btrfs_root_refs(&root->root_item) == 0 && - !btrfs_is_free_space_inode(inode)) + root != root->fs_info->tree_root) return 1; else return generic_drop_inode(inode); @@ -7299,40 +8055,22 @@ fail: static int btrfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { + u64 delalloc_bytes; struct inode *inode = dentry->d_inode; u32 blocksize = inode->i_sb->s_blocksize; generic_fillattr(inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; stat->blksize = PAGE_CACHE_SIZE; + + spin_lock(&BTRFS_I(inode)->lock); + delalloc_bytes = BTRFS_I(inode)->delalloc_bytes; + spin_unlock(&BTRFS_I(inode)->lock); stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + - ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9; + ALIGN(delalloc_bytes, blocksize)) >> 9; return 0; } -/* - * If a file is moved, it will inherit the cow and compression flags of the new - * directory. - */ -static void fixup_inode_flags(struct inode *dir, struct inode *inode) -{ - struct btrfs_inode *b_dir = BTRFS_I(dir); - struct btrfs_inode *b_inode = BTRFS_I(inode); - - if (b_dir->flags & BTRFS_INODE_NODATACOW) - b_inode->flags |= BTRFS_INODE_NODATACOW; - else - b_inode->flags &= ~BTRFS_INODE_NODATACOW; - - if (b_dir->flags & BTRFS_INODE_COMPRESS) { - b_inode->flags |= BTRFS_INODE_COMPRESS; - b_inode->flags &= ~BTRFS_INODE_NOCOMPRESS; - } else { - b_inode->flags &= ~(BTRFS_INODE_COMPRESS | - BTRFS_INODE_NOCOMPRESS); - } -} - static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { @@ -7403,7 +8141,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items * should cover the worst case number of items we'll modify. */ - trans = btrfs_start_transaction(root, 20); + trans = btrfs_start_transaction(root, 11); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_notrans; @@ -7498,8 +8236,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, } } - fixup_inode_flags(new_dir, old_inode); - ret = btrfs_add_link(trans, new_dir, old_inode, new_dentry->d_name.name, new_dentry->d_name.len, 0, index); @@ -7583,7 +8319,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) INIT_LIST_HEAD(&works); INIT_LIST_HEAD(&splice); -again: + spin_lock(&root->fs_info->delalloc_lock); list_splice_init(&root->fs_info->delalloc_inodes, &splice); while (!list_empty(&splice)) { @@ -7593,8 +8329,11 @@ again: list_del_init(&binode->delalloc_inodes); inode = igrab(&binode->vfs_inode); - if (!inode) + if (!inode) { + clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &binode->runtime_flags); continue; + } list_add_tail(&binode->delalloc_inodes, &root->fs_info->delalloc_inodes); @@ -7619,13 +8358,6 @@ again: btrfs_wait_and_free_delalloc_work(work); } - spin_lock(&root->fs_info->delalloc_lock); - if (!list_empty(&root->fs_info->delalloc_inodes)) { - spin_unlock(&root->fs_info->delalloc_lock); - goto again; - } - spin_unlock(&root->fs_info->delalloc_lock); - /* the filemap_flush will queue IO into the worker threads, but * we have to make sure the IO is actually started and that * ordered extents get created before we return @@ -7787,6 +8519,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, struct btrfs_key ins; u64 cur_offset = start; u64 i_size; + u64 cur_bytes; int ret = 0; bool own_trans = true; @@ -7801,8 +8534,10 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, } } - ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, - 0, *alloc_hint, &ins, 1); + cur_bytes = min(num_bytes, 256ULL * 1024 * 1024); + cur_bytes = max(cur_bytes, min_size); + ret = btrfs_reserve_extent(trans, root, cur_bytes, + min_size, 0, *alloc_hint, &ins, 1); if (ret) { if (own_trans) btrfs_end_transaction(trans, root); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5b22d45d3c6a..2c02310ff2d9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -42,12 +42,12 @@ #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/uuid.h> +#include <linux/btrfs.h> #include "compat.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "ioctl.h" #include "print-tree.h" #include "volumes.h" #include "locking.h" @@ -152,7 +152,7 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) static int btrfs_ioctl_getflags(struct file *file, void __user *arg) { - struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode); + struct btrfs_inode *ip = BTRFS_I(file_inode(file)); unsigned int flags = btrfs_flags_to_ioctl(ip->flags); if (copy_to_user(arg, &flags, sizeof(flags))) @@ -177,7 +177,7 @@ static int check_flags(unsigned int flags) static int btrfs_ioctl_setflags(struct file *file, void __user *arg) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct btrfs_inode *ip = BTRFS_I(inode); struct btrfs_root *root = ip->root; struct btrfs_trans_handle *trans; @@ -310,7 +310,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) static int btrfs_ioctl_getversion(struct file *file, int __user *arg) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); return put_user(inode->i_generation, arg); } @@ -363,46 +363,52 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) return 0; } -static noinline int create_subvol(struct btrfs_root *root, +static noinline int create_subvol(struct inode *dir, struct dentry *dentry, char *name, int namelen, u64 *async_transid, - struct btrfs_qgroup_inherit **inherit) + struct btrfs_qgroup_inherit *inherit) { struct btrfs_trans_handle *trans; struct btrfs_key key; struct btrfs_root_item root_item; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; + struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *new_root; - struct dentry *parent = dentry->d_parent; - struct inode *dir; + struct btrfs_block_rsv block_rsv; struct timespec cur_time = CURRENT_TIME; int ret; int err; u64 objectid; u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; u64 index = 0; + u64 qgroup_reserved; uuid_le new_uuid; ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); if (ret) return ret; - dir = parent->d_inode; - + btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); /* - * 1 - inode item - * 2 - refs - * 1 - root item - * 2 - dir items + * The same as the snapshot creation, please see the comment + * of create_snapshot(). */ - trans = btrfs_start_transaction(root, 6); - if (IS_ERR(trans)) - return PTR_ERR(trans); + ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, + 7, &qgroup_reserved); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + trans->block_rsv = &block_rsv; + trans->bytes_reserved = block_rsv.size; - ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, - inherit ? *inherit : NULL); + ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, inherit); if (ret) goto fail; @@ -515,22 +521,31 @@ static noinline int create_subvol(struct btrfs_root *root, BUG_ON(ret); - d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); fail: + trans->block_rsv = NULL; + trans->bytes_reserved = 0; if (async_transid) { *async_transid = trans->transid; err = btrfs_commit_transaction_async(trans, root, 1); + if (err) + err = btrfs_commit_transaction(trans, root); } else { err = btrfs_commit_transaction(trans, root); } if (err && !ret) ret = err; + + if (!ret) + d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); +out: + btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); return ret; } -static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, - char *name, int namelen, u64 *async_transid, - bool readonly, struct btrfs_qgroup_inherit **inherit) +static int create_snapshot(struct btrfs_root *root, struct inode *dir, + struct dentry *dentry, char *name, int namelen, + u64 *async_transid, bool readonly, + struct btrfs_qgroup_inherit *inherit) { struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; @@ -546,23 +561,31 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); + /* + * 1 - parent dir inode + * 2 - dir entries + * 1 - root item + * 2 - root ref/backref + * 1 - root of snapshot + */ + ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, + &pending_snapshot->block_rsv, 7, + &pending_snapshot->qgroup_reserved); + if (ret) + goto out; + pending_snapshot->dentry = dentry; pending_snapshot->root = root; pending_snapshot->readonly = readonly; - if (inherit) { - pending_snapshot->inherit = *inherit; - *inherit = NULL; /* take responsibility to free it */ - } + pending_snapshot->dir = dir; + pending_snapshot->inherit = inherit; - trans = btrfs_start_transaction(root->fs_info->extent_root, 6); + trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto fail; } - ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); - BUG_ON(ret); - spin_lock(&root->fs_info->trans_lock); list_add(&pending_snapshot->list, &trans->transaction->pending_snapshots); @@ -571,16 +594,14 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, *async_transid = trans->transid; ret = btrfs_commit_transaction_async(trans, root->fs_info->extent_root, 1); + if (ret) + ret = btrfs_commit_transaction(trans, root); } else { ret = btrfs_commit_transaction(trans, root->fs_info->extent_root); } - if (ret) { - /* cleanup_transaction has freed this for us */ - if (trans->aborted) - pending_snapshot = NULL; + if (ret) goto fail; - } ret = pending_snapshot->error; if (ret) @@ -599,6 +620,10 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, d_instantiate(dentry, inode); ret = 0; fail: + btrfs_subvolume_release_metadata(BTRFS_I(dir)->root, + &pending_snapshot->block_rsv, + pending_snapshot->qgroup_reserved); +out: kfree(pending_snapshot); return ret; } @@ -692,7 +717,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name, int namelen, struct btrfs_root *snap_src, u64 *async_transid, bool readonly, - struct btrfs_qgroup_inherit **inherit) + struct btrfs_qgroup_inherit *inherit) { struct inode *dir = parent->dentry->d_inode; struct dentry *dentry; @@ -729,11 +754,11 @@ static noinline int btrfs_mksubvol(struct path *parent, goto out_up_read; if (snap_src) { - error = create_snapshot(snap_src, dentry, name, namelen, + error = create_snapshot(snap_src, dir, dentry, name, namelen, async_transid, readonly, inherit); } else { - error = create_subvol(BTRFS_I(dir)->root, dentry, - name, namelen, async_transid, inherit); + error = create_subvol(dir, dentry, name, namelen, + async_transid, inherit); } if (!error) fsnotify_mkdir(dir, dentry); @@ -815,7 +840,7 @@ static int find_new_extents(struct btrfs_root *root, while(1) { ret = btrfs_search_forward(root, &min_key, &max_key, - path, 0, newer_than); + path, newer_than); if (ret != 0) goto none; if (min_key.objectid != ino) @@ -1203,6 +1228,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (!(inode->i_sb->s_flags & MS_ACTIVE)) break; + if (btrfs_defrag_cancelled(root->fs_info)) { + printk(KERN_DEBUG "btrfs: defrag_file cancelled\n"); + ret = -EAGAIN; + break; + } + if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, extent_thresh, &last_len, &skip, &defrag_end, range->flags & @@ -1317,7 +1348,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, u64 new_size; u64 old_size; u64 devid = 1; - struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; struct btrfs_ioctl_vol_args *vol_args; struct btrfs_trans_handle *trans; struct btrfs_device *device = NULL; @@ -1326,9 +1357,6 @@ static noinline int btrfs_ioctl_resize(struct file *file, int ret = 0; int mod = 0; - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; - if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1360,6 +1388,10 @@ static noinline int btrfs_ioctl_resize(struct file *file, *devstr = '\0'; devstr = vol_args->name; devid = simple_strtoull(devstr, &end, 10); + if (!devid) { + ret = -EINVAL; + goto out_free; + } printk(KERN_INFO "btrfs: resizing devid %llu\n", (unsigned long long)devid); } @@ -1368,7 +1400,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (!device) { printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", (unsigned long long)devid); - ret = -EINVAL; + ret = -ENODEV; goto out_free; } @@ -1376,7 +1408,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, printk(KERN_INFO "btrfs: resizer unable to apply on " "readonly device %llu\n", (unsigned long long)devid); - ret = -EINVAL; + ret = -EPERM; goto out_free; } @@ -1398,7 +1430,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, } if (device->is_tgtdev_for_dev_replace) { - ret = -EINVAL; + ret = -EPERM; goto out_free; } @@ -1454,7 +1486,7 @@ out: static noinline int btrfs_ioctl_snap_create_transid(struct file *file, char *name, unsigned long fd, int subvol, u64 *transid, bool readonly, - struct btrfs_qgroup_inherit **inherit) + struct btrfs_qgroup_inherit *inherit) { int namelen; int ret = 0; @@ -1486,8 +1518,8 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, goto out_drop_write; } - src_inode = src.file->f_path.dentry->d_inode; - if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) { + src_inode = file_inode(src.file); + if (src_inode->i_sb != file_inode(file)->i_sb) { printk(KERN_INFO "btrfs: Snapshot src from " "another FS\n"); ret = -EINVAL; @@ -1563,7 +1595,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, vol_args->fd, subvol, ptr, - readonly, &inherit); + readonly, inherit); if (ret == 0 && ptr && copy_to_user(arg + @@ -1579,7 +1611,7 @@ out: static noinline int btrfs_ioctl_subvol_getflags(struct file *file, void __user *arg) { - struct inode *inode = fdentry(file)->d_inode; + struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; u64 flags = 0; @@ -1601,7 +1633,7 @@ static noinline int btrfs_ioctl_subvol_getflags(struct file *file, static noinline int btrfs_ioctl_subvol_setflags(struct file *file, void __user *arg) { - struct inode *inode = fdentry(file)->d_inode; + struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; u64 root_flags; @@ -1860,7 +1892,7 @@ static noinline int search_ioctl(struct inode *inode, path->keep_locks = 1; while(1) { - ret = btrfs_search_forward(root, &key, &max_key, path, 0, + ret = btrfs_search_forward(root, &key, &max_key, path, sk->min_transid); if (ret != 0) { if (ret > 0) @@ -1895,7 +1927,7 @@ static noinline int btrfs_ioctl_tree_search(struct file *file, if (IS_ERR(args)) return PTR_ERR(args); - inode = fdentry(file)->d_inode; + inode = file_inode(file); ret = search_ioctl(inode, args); if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) ret = -EFAULT; @@ -2005,7 +2037,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file, if (IS_ERR(args)) return PTR_ERR(args); - inode = fdentry(file)->d_inode; + inode = file_inode(file); if (args->treeid == 0) args->treeid = BTRFS_I(inode)->root->root_key.objectid; @@ -2032,6 +2064,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct btrfs_root *dest = NULL; struct btrfs_ioctl_vol_args *vol_args; struct btrfs_trans_handle *trans; + struct btrfs_block_rsv block_rsv; + u64 qgroup_reserved; int namelen; int ret; int err = 0; @@ -2121,12 +2155,23 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (err) goto out_up_write; + btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); + /* + * One for dir inode, two for dir entries, two for root + * ref/backref. + */ + err = btrfs_subvolume_reserve_metadata(root, &block_rsv, + 5, &qgroup_reserved); + if (err) + goto out_up_write; + trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { err = PTR_ERR(trans); - goto out_up_write; + goto out_release; } - trans->block_rsv = &root->fs_info->global_block_rsv; + trans->block_rsv = &block_rsv; + trans->bytes_reserved = block_rsv.size; ret = btrfs_unlink_subvol(trans, root, dir, dest->root_key.objectid, @@ -2156,10 +2201,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, } } out_end_trans: + trans->block_rsv = NULL; + trans->bytes_reserved = 0; ret = btrfs_end_transaction(trans, root); if (ret && !err) err = ret; inode->i_flags |= S_DEAD; +out_release: + btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); out_up_write: up_write(&root->fs_info->subvol_sem); out_unlock: @@ -2168,6 +2217,12 @@ out_unlock: shrink_dcache_sb(root->fs_info->sb); btrfs_invalidate_inodes(dest); d_delete(dentry); + + /* the last ref */ + if (dest->cache_inode) { + iput(dest->cache_inode); + dest->cache_inode = NULL; + } } out_dput: dput(dentry); @@ -2181,7 +2236,7 @@ out: static int btrfs_ioctl_defrag(struct file *file, void __user *argp) { - struct inode *inode = fdentry(file)->d_inode; + struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_defrag_range_args *range; int ret; @@ -2190,13 +2245,6 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) if (ret) return ret; - if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, - 1)) { - pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - mnt_drop_write_file(file); - return -EINVAL; - } - if (btrfs_root_readonly(root)) { ret = -EROFS; goto out; @@ -2208,10 +2256,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EPERM; goto out; } - ret = btrfs_defrag_root(root, 0); + ret = btrfs_defrag_root(root); if (ret) goto out; - ret = btrfs_defrag_root(root->fs_info->extent_root, 0); + ret = btrfs_defrag_root(root->fs_info->extent_root); break; case S_IFREG: if (!(file->f_mode & FMODE_WRITE)) { @@ -2241,7 +2289,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) /* the rest are all set to zero by kzalloc */ range->len = (u64)-1; } - ret = btrfs_defrag_file(fdentry(file)->d_inode, file, + ret = btrfs_defrag_file(file_inode(file), file, range, 0, 0); if (ret > 0) ret = 0; @@ -2251,7 +2299,6 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EINVAL; } out: - atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); mnt_drop_write_file(file); return ret; } @@ -2289,7 +2336,7 @@ out: static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; struct btrfs_ioctl_vol_args *vol_args; int ret; @@ -2412,7 +2459,7 @@ out: static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, u64 off, u64 olen, u64 destoff) { - struct inode *inode = fdentry(file)->d_inode; + struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct fd src_file; struct inode *src; @@ -2458,7 +2505,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (src_file.file->f_path.mnt != file->f_path.mnt) goto out_fput; - src = src_file.file->f_dentry->d_inode; + src = file_inode(src_file.file); ret = -EINVAL; if (src == inode) @@ -2820,7 +2867,7 @@ static long btrfs_ioctl_clone_range(struct file *file, void __user *argp) */ static long btrfs_ioctl_trans_start(struct file *file) { - struct inode *inode = fdentry(file)->d_inode; + struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; int ret; @@ -2860,7 +2907,7 @@ out: static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) { - struct inode *inode = fdentry(file)->d_inode; + struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root *new_root; struct btrfs_dir_item *di; @@ -3084,7 +3131,7 @@ out: */ long btrfs_ioctl_trans_end(struct file *file) { - struct inode *inode = fdentry(file)->d_inode; + struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; @@ -3108,7 +3155,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, u64 transid; int ret; - trans = btrfs_attach_transaction(root); + trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT) return PTR_ERR(trans); @@ -3146,7 +3193,7 @@ static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root, static long btrfs_ioctl_scrub(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; struct btrfs_ioctl_scrub_args *sa; int ret; @@ -3286,7 +3333,7 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) struct inode_fs_paths *ipath = NULL; struct btrfs_path *path; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_DAC_READ_SEARCH)) return -EPERM; path = btrfs_alloc_path(); @@ -3437,7 +3484,7 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, static long btrfs_ioctl_balance(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ioctl_balance_args *bargs; struct btrfs_balance_control *bctl; @@ -3627,7 +3674,7 @@ out: static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; struct btrfs_ioctl_quota_ctl_args *sa; struct btrfs_trans_handle *trans = NULL; int ret; @@ -3686,7 +3733,7 @@ drop_write: static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; struct btrfs_ioctl_qgroup_assign_args *sa; struct btrfs_trans_handle *trans; int ret; @@ -3733,7 +3780,7 @@ drop_write: static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; struct btrfs_ioctl_qgroup_create_args *sa; struct btrfs_trans_handle *trans; int ret; @@ -3784,7 +3831,7 @@ drop_write: static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; struct btrfs_ioctl_qgroup_limit_args *sa; struct btrfs_trans_handle *trans; int ret; @@ -3834,7 +3881,7 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, void __user *arg) { struct btrfs_ioctl_received_subvol_args *sa = NULL; - struct inode *inode = fdentry(file)->d_inode; + struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root_item *root_item = &root->root_item; struct btrfs_trans_handle *trans; @@ -3911,10 +3958,69 @@ out: return ret; } +static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + const char *label = root->fs_info->super_copy->label; + size_t len = strnlen(label, BTRFS_LABEL_SIZE); + int ret; + + if (len == BTRFS_LABEL_SIZE) { + pr_warn("btrfs: label is too long, return the first %zu bytes\n", + --len); + } + + mutex_lock(&root->fs_info->volume_mutex); + ret = copy_to_user(arg, label, len); + mutex_unlock(&root->fs_info->volume_mutex); + + return ret ? -EFAULT : 0; +} + +static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_super_block *super_block = root->fs_info->super_copy; + struct btrfs_trans_handle *trans; + char label[BTRFS_LABEL_SIZE]; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(label, arg, sizeof(label))) + return -EFAULT; + + if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) { + pr_err("btrfs: unable to set label with more than %d bytes\n", + BTRFS_LABEL_SIZE - 1); + return -EINVAL; + } + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + mutex_lock(&root->fs_info->volume_mutex); + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_unlock; + } + + strcpy(super_block->label, label); + ret = btrfs_end_transaction(trans, root); + +out_unlock: + mutex_unlock(&root->fs_info->volume_mutex); + mnt_drop_write_file(file); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; void __user *argp = (void __user *)arg; switch (cmd) { @@ -4011,6 +4117,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_qgroup_limit(file, argp); case BTRFS_IOC_DEV_REPLACE: return btrfs_ioctl_dev_replace(root, argp); + case BTRFS_IOC_GET_FSLABEL: + return btrfs_ioctl_get_fslabel(file, argp); + case BTRFS_IOC_SET_FSLABEL: + return btrfs_ioctl_set_fslabel(file, argp); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h deleted file mode 100644 index dabca9cc8c2e..000000000000 --- a/fs/btrfs/ioctl.h +++ /dev/null @@ -1,502 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef __IOCTL_ -#define __IOCTL_ -#include <linux/ioctl.h> - -#define BTRFS_IOCTL_MAGIC 0x94 -#define BTRFS_VOL_NAME_MAX 255 - -/* this should be 4k */ -#define BTRFS_PATH_NAME_MAX 4087 -struct btrfs_ioctl_vol_args { - __s64 fd; - char name[BTRFS_PATH_NAME_MAX + 1]; -}; - -#define BTRFS_DEVICE_PATH_NAME_MAX 1024 - -#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) -#define BTRFS_SUBVOL_RDONLY (1ULL << 1) -#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2) -#define BTRFS_FSID_SIZE 16 -#define BTRFS_UUID_SIZE 16 - -#define BTRFS_QGROUP_INHERIT_SET_LIMITS (1ULL << 0) - -struct btrfs_qgroup_limit { - __u64 flags; - __u64 max_rfer; - __u64 max_excl; - __u64 rsv_rfer; - __u64 rsv_excl; -}; - -struct btrfs_qgroup_inherit { - __u64 flags; - __u64 num_qgroups; - __u64 num_ref_copies; - __u64 num_excl_copies; - struct btrfs_qgroup_limit lim; - __u64 qgroups[0]; -}; - -struct btrfs_ioctl_qgroup_limit_args { - __u64 qgroupid; - struct btrfs_qgroup_limit lim; -}; - -#define BTRFS_SUBVOL_NAME_MAX 4039 -struct btrfs_ioctl_vol_args_v2 { - __s64 fd; - __u64 transid; - __u64 flags; - union { - struct { - __u64 size; - struct btrfs_qgroup_inherit __user *qgroup_inherit; - }; - __u64 unused[4]; - }; - char name[BTRFS_SUBVOL_NAME_MAX + 1]; -}; - -/* - * structure to report errors and progress to userspace, either as a - * result of a finished scrub, a canceled scrub or a progress inquiry - */ -struct btrfs_scrub_progress { - __u64 data_extents_scrubbed; /* # of data extents scrubbed */ - __u64 tree_extents_scrubbed; /* # of tree extents scrubbed */ - __u64 data_bytes_scrubbed; /* # of data bytes scrubbed */ - __u64 tree_bytes_scrubbed; /* # of tree bytes scrubbed */ - __u64 read_errors; /* # of read errors encountered (EIO) */ - __u64 csum_errors; /* # of failed csum checks */ - __u64 verify_errors; /* # of occurences, where the metadata - * of a tree block did not match the - * expected values, like generation or - * logical */ - __u64 no_csum; /* # of 4k data block for which no csum - * is present, probably the result of - * data written with nodatasum */ - __u64 csum_discards; /* # of csum for which no data was found - * in the extent tree. */ - __u64 super_errors; /* # of bad super blocks encountered */ - __u64 malloc_errors; /* # of internal kmalloc errors. These - * will likely cause an incomplete - * scrub */ - __u64 uncorrectable_errors; /* # of errors where either no intact - * copy was found or the writeback - * failed */ - __u64 corrected_errors; /* # of errors corrected */ - __u64 last_physical; /* last physical address scrubbed. In - * case a scrub was aborted, this can - * be used to restart the scrub */ - __u64 unverified_errors; /* # of occurences where a read for a - * full (64k) bio failed, but the re- - * check succeeded for each 4k piece. - * Intermittent error. */ -}; - -#define BTRFS_SCRUB_READONLY 1 -struct btrfs_ioctl_scrub_args { - __u64 devid; /* in */ - __u64 start; /* in */ - __u64 end; /* in */ - __u64 flags; /* in */ - struct btrfs_scrub_progress progress; /* out */ - /* pad to 1k */ - __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8]; -}; - -#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 -#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 -struct btrfs_ioctl_dev_replace_start_params { - __u64 srcdevid; /* in, if 0, use srcdev_name instead */ - __u64 cont_reading_from_srcdev_mode; /* in, see #define - * above */ - __u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */ - __u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */ -}; - -#define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0 -#define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED 1 -#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2 -#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3 -#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4 -struct btrfs_ioctl_dev_replace_status_params { - __u64 replace_state; /* out, see #define above */ - __u64 progress_1000; /* out, 0 <= x <= 1000 */ - __u64 time_started; /* out, seconds since 1-Jan-1970 */ - __u64 time_stopped; /* out, seconds since 1-Jan-1970 */ - __u64 num_write_errors; /* out */ - __u64 num_uncorrectable_read_errors; /* out */ -}; - -#define BTRFS_IOCTL_DEV_REPLACE_CMD_START 0 -#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1 -#define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL 2 -#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0 -#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1 -#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2 -struct btrfs_ioctl_dev_replace_args { - __u64 cmd; /* in */ - __u64 result; /* out */ - - union { - struct btrfs_ioctl_dev_replace_start_params start; - struct btrfs_ioctl_dev_replace_status_params status; - }; /* in/out */ - - __u64 spare[64]; -}; - -struct btrfs_ioctl_dev_info_args { - __u64 devid; /* in/out */ - __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */ - __u64 bytes_used; /* out */ - __u64 total_bytes; /* out */ - __u64 unused[379]; /* pad to 4k */ - __u8 path[BTRFS_DEVICE_PATH_NAME_MAX]; /* out */ -}; - -struct btrfs_ioctl_fs_info_args { - __u64 max_id; /* out */ - __u64 num_devices; /* out */ - __u8 fsid[BTRFS_FSID_SIZE]; /* out */ - __u64 reserved[124]; /* pad to 1k */ -}; - -/* balance control ioctl modes */ -#define BTRFS_BALANCE_CTL_PAUSE 1 -#define BTRFS_BALANCE_CTL_CANCEL 2 - -/* - * this is packed, because it should be exactly the same as its disk - * byte order counterpart (struct btrfs_disk_balance_args) - */ -struct btrfs_balance_args { - __u64 profiles; - __u64 usage; - __u64 devid; - __u64 pstart; - __u64 pend; - __u64 vstart; - __u64 vend; - - __u64 target; - - __u64 flags; - - __u64 unused[8]; -} __attribute__ ((__packed__)); - -/* report balance progress to userspace */ -struct btrfs_balance_progress { - __u64 expected; /* estimated # of chunks that will be - * relocated to fulfill the request */ - __u64 considered; /* # of chunks we have considered so far */ - __u64 completed; /* # of chunks relocated so far */ -}; - -#define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0) -#define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1) -#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2) - -struct btrfs_ioctl_balance_args { - __u64 flags; /* in/out */ - __u64 state; /* out */ - - struct btrfs_balance_args data; /* in/out */ - struct btrfs_balance_args meta; /* in/out */ - struct btrfs_balance_args sys; /* in/out */ - - struct btrfs_balance_progress stat; /* out */ - - __u64 unused[72]; /* pad to 1k */ -}; - -#define BTRFS_INO_LOOKUP_PATH_MAX 4080 -struct btrfs_ioctl_ino_lookup_args { - __u64 treeid; - __u64 objectid; - char name[BTRFS_INO_LOOKUP_PATH_MAX]; -}; - -struct btrfs_ioctl_search_key { - /* which root are we searching. 0 is the tree of tree roots */ - __u64 tree_id; - - /* keys returned will be >= min and <= max */ - __u64 min_objectid; - __u64 max_objectid; - - /* keys returned will be >= min and <= max */ - __u64 min_offset; - __u64 max_offset; - - /* max and min transids to search for */ - __u64 min_transid; - __u64 max_transid; - - /* keys returned will be >= min and <= max */ - __u32 min_type; - __u32 max_type; - - /* - * how many items did userland ask for, and how many are we - * returning - */ - __u32 nr_items; - - /* align to 64 bits */ - __u32 unused; - - /* some extra for later */ - __u64 unused1; - __u64 unused2; - __u64 unused3; - __u64 unused4; -}; - -struct btrfs_ioctl_search_header { - __u64 transid; - __u64 objectid; - __u64 offset; - __u32 type; - __u32 len; -}; - -#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key)) -/* - * the buf is an array of search headers where - * each header is followed by the actual item - * the type field is expanded to 32 bits for alignment - */ -struct btrfs_ioctl_search_args { - struct btrfs_ioctl_search_key key; - char buf[BTRFS_SEARCH_ARGS_BUFSIZE]; -}; - -struct btrfs_ioctl_clone_range_args { - __s64 src_fd; - __u64 src_offset, src_length; - __u64 dest_offset; -}; - -/* flags for the defrag range ioctl */ -#define BTRFS_DEFRAG_RANGE_COMPRESS 1 -#define BTRFS_DEFRAG_RANGE_START_IO 2 - -struct btrfs_ioctl_space_info { - __u64 flags; - __u64 total_bytes; - __u64 used_bytes; -}; - -struct btrfs_ioctl_space_args { - __u64 space_slots; - __u64 total_spaces; - struct btrfs_ioctl_space_info spaces[0]; -}; - -struct btrfs_data_container { - __u32 bytes_left; /* out -- bytes not needed to deliver output */ - __u32 bytes_missing; /* out -- additional bytes needed for result */ - __u32 elem_cnt; /* out */ - __u32 elem_missed; /* out */ - __u64 val[0]; /* out */ -}; - -struct btrfs_ioctl_ino_path_args { - __u64 inum; /* in */ - __u64 size; /* in */ - __u64 reserved[4]; - /* struct btrfs_data_container *fspath; out */ - __u64 fspath; /* out */ -}; - -struct btrfs_ioctl_logical_ino_args { - __u64 logical; /* in */ - __u64 size; /* in */ - __u64 reserved[4]; - /* struct btrfs_data_container *inodes; out */ - __u64 inodes; -}; - -enum btrfs_dev_stat_values { - /* disk I/O failure stats */ - BTRFS_DEV_STAT_WRITE_ERRS, /* EIO or EREMOTEIO from lower layers */ - BTRFS_DEV_STAT_READ_ERRS, /* EIO or EREMOTEIO from lower layers */ - BTRFS_DEV_STAT_FLUSH_ERRS, /* EIO or EREMOTEIO from lower layers */ - - /* stats for indirect indications for I/O failures */ - BTRFS_DEV_STAT_CORRUPTION_ERRS, /* checksum error, bytenr error or - * contents is illegal: this is an - * indication that the block was damaged - * during read or write, or written to - * wrong location or read from wrong - * location */ - BTRFS_DEV_STAT_GENERATION_ERRS, /* an indication that blocks have not - * been written */ - - BTRFS_DEV_STAT_VALUES_MAX -}; - -/* Reset statistics after reading; needs SYS_ADMIN capability */ -#define BTRFS_DEV_STATS_RESET (1ULL << 0) - -struct btrfs_ioctl_get_dev_stats { - __u64 devid; /* in */ - __u64 nr_items; /* in/out */ - __u64 flags; /* in/out */ - - /* out values: */ - __u64 values[BTRFS_DEV_STAT_VALUES_MAX]; - - __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */ -}; - -#define BTRFS_QUOTA_CTL_ENABLE 1 -#define BTRFS_QUOTA_CTL_DISABLE 2 -#define BTRFS_QUOTA_CTL_RESCAN 3 -struct btrfs_ioctl_quota_ctl_args { - __u64 cmd; - __u64 status; -}; - -struct btrfs_ioctl_qgroup_assign_args { - __u64 assign; - __u64 src; - __u64 dst; -}; - -struct btrfs_ioctl_qgroup_create_args { - __u64 create; - __u64 qgroupid; -}; -struct btrfs_ioctl_timespec { - __u64 sec; - __u32 nsec; -}; - -struct btrfs_ioctl_received_subvol_args { - char uuid[BTRFS_UUID_SIZE]; /* in */ - __u64 stransid; /* in */ - __u64 rtransid; /* out */ - struct btrfs_ioctl_timespec stime; /* in */ - struct btrfs_ioctl_timespec rtime; /* out */ - __u64 flags; /* in */ - __u64 reserved[16]; /* in */ -}; - -struct btrfs_ioctl_send_args { - __s64 send_fd; /* in */ - __u64 clone_sources_count; /* in */ - __u64 __user *clone_sources; /* in */ - __u64 parent_root; /* in */ - __u64 flags; /* in */ - __u64 reserved[4]; /* in */ -}; - -#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ - struct btrfs_ioctl_vol_args) -/* trans start and trans end are dangerous, and only for - * use by applications that know how to avoid the - * resulting deadlocks - */ -#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6) -#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7) -#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8) - -#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int) -#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \ - struct btrfs_ioctl_vol_args) - -#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \ - struct btrfs_ioctl_clone_range_args) - -#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \ - struct btrfs_ioctl_defrag_range_args) -#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \ - struct btrfs_ioctl_search_args) -#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \ - struct btrfs_ioctl_ino_lookup_args) -#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64) -#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \ - struct btrfs_ioctl_space_args) -#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64) -#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) -#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ - struct btrfs_ioctl_vol_args_v2) -#define BTRFS_IOC_SUBVOL_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 24, \ - struct btrfs_ioctl_vol_args_v2) -#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64) -#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64) -#define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \ - struct btrfs_ioctl_scrub_args) -#define BTRFS_IOC_SCRUB_CANCEL _IO(BTRFS_IOCTL_MAGIC, 28) -#define BTRFS_IOC_SCRUB_PROGRESS _IOWR(BTRFS_IOCTL_MAGIC, 29, \ - struct btrfs_ioctl_scrub_args) -#define BTRFS_IOC_DEV_INFO _IOWR(BTRFS_IOCTL_MAGIC, 30, \ - struct btrfs_ioctl_dev_info_args) -#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ - struct btrfs_ioctl_fs_info_args) -#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \ - struct btrfs_ioctl_balance_args) -#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int) -#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \ - struct btrfs_ioctl_balance_args) -#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ - struct btrfs_ioctl_ino_path_args) -#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ - struct btrfs_ioctl_ino_path_args) -#define BTRFS_IOC_SET_RECEIVED_SUBVOL _IOWR(BTRFS_IOCTL_MAGIC, 37, \ - struct btrfs_ioctl_received_subvol_args) -#define BTRFS_IOC_SEND _IOW(BTRFS_IOCTL_MAGIC, 38, struct btrfs_ioctl_send_args) -#define BTRFS_IOC_DEVICES_READY _IOR(BTRFS_IOCTL_MAGIC, 39, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_QUOTA_CTL _IOWR(BTRFS_IOCTL_MAGIC, 40, \ - struct btrfs_ioctl_quota_ctl_args) -#define BTRFS_IOC_QGROUP_ASSIGN _IOW(BTRFS_IOCTL_MAGIC, 41, \ - struct btrfs_ioctl_qgroup_assign_args) -#define BTRFS_IOC_QGROUP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 42, \ - struct btrfs_ioctl_qgroup_create_args) -#define BTRFS_IOC_QGROUP_LIMIT _IOR(BTRFS_IOCTL_MAGIC, 43, \ - struct btrfs_ioctl_qgroup_limit_args) -#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ - struct btrfs_ioctl_get_dev_stats) -#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \ - struct btrfs_ioctl_dev_replace_args) - -#endif diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 2a1762c66041..e95df435d897 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -113,11 +113,10 @@ again: read_unlock(&eb->lock); return; } - read_unlock(&eb->lock); - wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); - read_lock(&eb->lock); if (atomic_read(&eb->blocking_writers)) { read_unlock(&eb->lock); + wait_event(eb->write_lock_wq, + atomic_read(&eb->blocking_writers) == 0); goto again; } atomic_inc(&eb->read_locks); diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index ca52681e5f40..b81e0e9a4894 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -26,7 +26,6 @@ void btrfs_tree_lock(struct extent_buffer *eb); void btrfs_tree_unlock(struct extent_buffer *eb); -int btrfs_try_spin_lock(struct extent_buffer *eb); void btrfs_tree_read_lock(struct extent_buffer *eb); void btrfs_tree_read_unlock(struct extent_buffer *eb); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index f10731297040..005c45db699e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -196,6 +196,9 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->file_offset = file_offset; entry->start = start; entry->len = len; + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) && + !(type == BTRFS_ORDERED_NOCOW)) + entry->csum_bytes_left = disk_len; entry->disk_len = disk_len; entry->bytes_left = len; entry->inode = igrab(inode); @@ -213,6 +216,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, INIT_LIST_HEAD(&entry->root_extent_list); INIT_LIST_HEAD(&entry->work_list); init_completion(&entry->completion); + INIT_LIST_HEAD(&entry->log_list); trace_btrfs_ordered_extent_add(inode, entry); @@ -270,6 +274,10 @@ void btrfs_add_ordered_sum(struct inode *inode, tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irq(&tree->lock); list_add_tail(&sum->list, &entry->list); + WARN_ON(entry->csum_bytes_left < sum->len); + entry->csum_bytes_left -= sum->len; + if (entry->csum_bytes_left == 0) + wake_up(&entry->wait); spin_unlock_irq(&tree->lock); } @@ -405,6 +413,66 @@ out: return ret == 0; } +/* Needs to either be called under a log transaction or the log_mutex */ +void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode) +{ + struct btrfs_ordered_inode_tree *tree; + struct btrfs_ordered_extent *ordered; + struct rb_node *n; + int index = log->log_transid % 2; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock_irq(&tree->lock); + for (n = rb_first(&tree->tree); n; n = rb_next(n)) { + ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); + spin_lock(&log->log_extents_lock[index]); + if (list_empty(&ordered->log_list)) { + list_add_tail(&ordered->log_list, &log->logged_list[index]); + atomic_inc(&ordered->refs); + } + spin_unlock(&log->log_extents_lock[index]); + } + spin_unlock_irq(&tree->lock); +} + +void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) +{ + struct btrfs_ordered_extent *ordered; + int index = transid % 2; + + spin_lock_irq(&log->log_extents_lock[index]); + while (!list_empty(&log->logged_list[index])) { + ordered = list_first_entry(&log->logged_list[index], + struct btrfs_ordered_extent, + log_list); + list_del_init(&ordered->log_list); + spin_unlock_irq(&log->log_extents_lock[index]); + wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, + &ordered->flags)); + btrfs_put_ordered_extent(ordered); + spin_lock_irq(&log->log_extents_lock[index]); + } + spin_unlock_irq(&log->log_extents_lock[index]); +} + +void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid) +{ + struct btrfs_ordered_extent *ordered; + int index = transid % 2; + + spin_lock_irq(&log->log_extents_lock[index]); + while (!list_empty(&log->logged_list[index])) { + ordered = list_first_entry(&log->logged_list[index], + struct btrfs_ordered_extent, + log_list); + list_del_init(&ordered->log_list); + spin_unlock_irq(&log->log_extents_lock[index]); + btrfs_put_ordered_extent(ordered); + spin_lock_irq(&log->log_extents_lock[index]); + } + spin_unlock_irq(&log->log_extents_lock[index]); +} + /* * used to drop a reference on an ordered extent. This will free * the extent if the last reference is dropped @@ -489,6 +557,7 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) INIT_LIST_HEAD(&splice); INIT_LIST_HEAD(&works); + mutex_lock(&root->fs_info->ordered_operations_mutex); spin_lock(&root->fs_info->ordered_extent_lock); list_splice_init(&root->fs_info->ordered_extents, &splice); while (!list_empty(&splice)) { @@ -532,6 +601,7 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) cond_resched(); } + mutex_unlock(&root->fs_info->ordered_operations_mutex); } /* @@ -544,10 +614,12 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) * extra check to make sure the ordered operation list really is empty * before we return */ -int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) +int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int wait) { struct btrfs_inode *btrfs_inode; struct inode *inode; + struct btrfs_transaction *cur_trans = trans->transaction; struct list_head splice; struct list_head works; struct btrfs_delalloc_work *work, *next; @@ -558,14 +630,10 @@ int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) mutex_lock(&root->fs_info->ordered_operations_mutex); spin_lock(&root->fs_info->ordered_extent_lock); -again: - list_splice_init(&root->fs_info->ordered_operations, &splice); - + list_splice_init(&cur_trans->ordered_operations, &splice); while (!list_empty(&splice)) { - btrfs_inode = list_entry(splice.next, struct btrfs_inode, ordered_operations); - inode = &btrfs_inode->vfs_inode; list_del_init(&btrfs_inode->ordered_operations); @@ -574,24 +642,22 @@ again: * the inode may be getting freed (in sys_unlink path). */ inode = igrab(inode); - - if (!wait && inode) { - list_add_tail(&BTRFS_I(inode)->ordered_operations, - &root->fs_info->ordered_operations); - } - if (!inode) continue; + + if (!wait) + list_add_tail(&BTRFS_I(inode)->ordered_operations, + &cur_trans->ordered_operations); spin_unlock(&root->fs_info->ordered_extent_lock); work = btrfs_alloc_delalloc_work(inode, wait, 1); if (!work) { + spin_lock(&root->fs_info->ordered_extent_lock); if (list_empty(&BTRFS_I(inode)->ordered_operations)) list_add_tail(&btrfs_inode->ordered_operations, &splice); - spin_lock(&root->fs_info->ordered_extent_lock); list_splice_tail(&splice, - &root->fs_info->ordered_operations); + &cur_trans->ordered_operations); spin_unlock(&root->fs_info->ordered_extent_lock); ret = -ENOMEM; goto out; @@ -603,9 +669,6 @@ again: cond_resched(); spin_lock(&root->fs_info->ordered_extent_lock); } - if (wait && !list_empty(&root->fs_info->ordered_operations)) - goto again; - spin_unlock(&root->fs_info->ordered_extent_lock); out: list_for_each_entry_safe(work, next, &works, list) { @@ -836,9 +899,16 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, * if the disk i_size is already at the inode->i_size, or * this ordered extent is inside the disk i_size, we're done */ - if (disk_i_size == i_size || offset <= disk_i_size) { + if (disk_i_size == i_size) + goto out; + + /* + * We still need to update disk_i_size if outstanding_isize is greater + * than disk_i_size. + */ + if (offset <= disk_i_size && + (!ordered || ordered->outstanding_isize <= disk_i_size)) goto out; - } /* * walk backward from this ordered extent to disk_i_size. @@ -870,7 +940,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, break; if (test->file_offset >= i_size) break; - if (test->file_offset >= disk_i_size) { + if (entry_end(test) > disk_i_size) { /* * we don't update disk_i_size now, so record this * undealt i_size. Or we will not know the real @@ -967,6 +1037,7 @@ out: void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode) { + struct btrfs_transaction *cur_trans = trans->transaction; u64 last_mod; last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); @@ -981,7 +1052,7 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, spin_lock(&root->fs_info->ordered_extent_lock); if (list_empty(&BTRFS_I(inode)->ordered_operations)) { list_add_tail(&BTRFS_I(inode)->ordered_operations, - &root->fs_info->ordered_operations); + &cur_trans->ordered_operations); } spin_unlock(&root->fs_info->ordered_extent_lock); } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index f29d4bf5fbe7..8eadfe406cdd 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -79,6 +79,8 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent * has done its due diligence in updating * the isize. */ +#define BTRFS_ORDERED_LOGGED_CSUM 8 /* We've logged the csums on this ordered + ordered extent */ struct btrfs_ordered_extent { /* logical offset in the file */ @@ -96,6 +98,9 @@ struct btrfs_ordered_extent { /* number of bytes that still need writing */ u64 bytes_left; + /* number of bytes that still need csumming */ + u64 csum_bytes_left; + /* * the end of the ordered extent which is behind it but * didn't update disk_i_size. Please see the comment of @@ -118,6 +123,9 @@ struct btrfs_ordered_extent { /* list of checksums for insertion when the extent io is done */ struct list_head list; + /* If we need to wait on this to be done */ + struct list_head log_list; + /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ wait_queue_head_t wait; @@ -189,11 +197,15 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); -int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); +int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int wait); void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput); +void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode); +void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); +void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); int __init ordered_data_init(void); void ordered_data_exit(void); #endif diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 50d95fd190a5..920957ecb27e 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -294,6 +294,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) btrfs_dev_extent_chunk_offset(l, dev_extent), (unsigned long long) btrfs_dev_extent_length(l, dev_extent)); + break; case BTRFS_DEV_STATS_KEY: printk(KERN_INFO "\t\tdevice stats\n"); break; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index a5c856234323..b44124dd2370 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -23,13 +23,13 @@ #include <linux/rbtree.h> #include <linux/slab.h> #include <linux/workqueue.h> +#include <linux/btrfs.h> #include "ctree.h" #include "transaction.h" #include "disk-io.h" #include "locking.h" #include "ulist.h" -#include "ioctl.h" #include "backref.h" /* TODO XXX FIXME @@ -620,7 +620,9 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, key.offset = qgroupid; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret > 0) ret = -ENOENT; @@ -661,7 +663,9 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, key.offset = qgroup->qgroupid; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret > 0) ret = -ENOENT; @@ -702,7 +706,9 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans, key.offset = 0; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret > 0) ret = -ENOENT; @@ -732,33 +738,38 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, { struct btrfs_path *path; struct btrfs_key key; + struct extent_buffer *leaf = NULL; int ret; - - if (!root) - return -EINVAL; + int nr = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - while (1) { - key.objectid = 0; - key.offset = 0; - key.type = 0; + path->leave_spinning = 1; - path->leave_spinning = 1; + key.objectid = 0; + key.offset = 0; + key.type = 0; + + while (1) { ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret > 0) { - if (path->slots[0] == 0) - break; - path->slots[0]--; - } else if (ret < 0) { + if (ret < 0) + goto out; + leaf = path->nodes[0]; + nr = btrfs_header_nritems(leaf); + if (!nr) break; - } - - ret = btrfs_del_item(trans, root, path); + /* + * delete the leaf one by one + * since the whole tree is going + * to be deleted. + */ + path->slots[0] = 0; + ret = btrfs_del_items(trans, root, path, 0, nr); if (ret) goto out; + btrfs_release_path(path); } ret = 0; @@ -847,6 +858,10 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, int ret = 0; spin_lock(&fs_info->qgroup_lock); + if (!fs_info->quota_root) { + spin_unlock(&fs_info->qgroup_lock); + return 0; + } fs_info->quota_enabled = 0; fs_info->pending_quota_state = 0; quota_root = fs_info->quota_root; @@ -1138,7 +1153,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, sgn > 0 ? node->seq - 1 : node->seq, &roots); if (ret < 0) - goto out; + return ret; spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; @@ -1260,7 +1275,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, ret = 0; unlock: spin_unlock(&fs_info->qgroup_lock); -out: ulist_free(roots); ulist_free(tmp); @@ -1510,21 +1524,23 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && qg->reserved + qg->rfer + num_bytes > - qg->max_rfer) + qg->max_rfer) { ret = -EDQUOT; + goto out; + } if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) && qg->reserved + qg->excl + num_bytes > - qg->max_excl) + qg->max_excl) { ret = -EDQUOT; + goto out; + } list_for_each_entry(glist, &qg->groups, next_group) { ulist_add(ulist, glist->group->qgroupid, (uintptr_t)glist->group, GFP_ATOMIC); } } - if (ret) - goto out; /* * no limits exceeded, now record the reservation into all qgroups diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c new file mode 100644 index 000000000000..9a79fb790adb --- /dev/null +++ b/fs/btrfs/raid56.c @@ -0,0 +1,2100 @@ +/* + * Copyright (C) 2012 Fusion-io All rights reserved. + * Copyright (C) 2012 Intel Corp. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include <linux/sched.h> +#include <linux/wait.h> +#include <linux/bio.h> +#include <linux/slab.h> +#include <linux/buffer_head.h> +#include <linux/blkdev.h> +#include <linux/random.h> +#include <linux/iocontext.h> +#include <linux/capability.h> +#include <linux/ratelimit.h> +#include <linux/kthread.h> +#include <linux/raid/pq.h> +#include <linux/hash.h> +#include <linux/list_sort.h> +#include <linux/raid/xor.h> +#include <linux/vmalloc.h> +#include <asm/div64.h> +#include "compat.h" +#include "ctree.h" +#include "extent_map.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "volumes.h" +#include "raid56.h" +#include "async-thread.h" +#include "check-integrity.h" +#include "rcu-string.h" + +/* set when additional merges to this rbio are not allowed */ +#define RBIO_RMW_LOCKED_BIT 1 + +/* + * set when this rbio is sitting in the hash, but it is just a cache + * of past RMW + */ +#define RBIO_CACHE_BIT 2 + +/* + * set when it is safe to trust the stripe_pages for caching + */ +#define RBIO_CACHE_READY_BIT 3 + + +#define RBIO_CACHE_SIZE 1024 + +struct btrfs_raid_bio { + struct btrfs_fs_info *fs_info; + struct btrfs_bio *bbio; + + /* + * logical block numbers for the start of each stripe + * The last one or two are p/q. These are sorted, + * so raid_map[0] is the start of our full stripe + */ + u64 *raid_map; + + /* while we're doing rmw on a stripe + * we put it into a hash table so we can + * lock the stripe and merge more rbios + * into it. + */ + struct list_head hash_list; + + /* + * LRU list for the stripe cache + */ + struct list_head stripe_cache; + + /* + * for scheduling work in the helper threads + */ + struct btrfs_work work; + + /* + * bio list and bio_list_lock are used + * to add more bios into the stripe + * in hopes of avoiding the full rmw + */ + struct bio_list bio_list; + spinlock_t bio_list_lock; + + /* also protected by the bio_list_lock, the + * plug list is used by the plugging code + * to collect partial bios while plugged. The + * stripe locking code also uses it to hand off + * the stripe lock to the next pending IO + */ + struct list_head plug_list; + + /* + * flags that tell us if it is safe to + * merge with this bio + */ + unsigned long flags; + + /* size of each individual stripe on disk */ + int stripe_len; + + /* number of data stripes (no p/q) */ + int nr_data; + + /* + * set if we're doing a parity rebuild + * for a read from higher up, which is handled + * differently from a parity rebuild as part of + * rmw + */ + int read_rebuild; + + /* first bad stripe */ + int faila; + + /* second bad stripe (for raid6 use) */ + int failb; + + /* + * number of pages needed to represent the full + * stripe + */ + int nr_pages; + + /* + * size of all the bios in the bio_list. This + * helps us decide if the rbio maps to a full + * stripe or not + */ + int bio_list_bytes; + + atomic_t refs; + + /* + * these are two arrays of pointers. We allocate the + * rbio big enough to hold them both and setup their + * locations when the rbio is allocated + */ + + /* pointers to pages that we allocated for + * reading/writing stripes directly from the disk (including P/Q) + */ + struct page **stripe_pages; + + /* + * pointers to the pages in the bio_list. Stored + * here for faster lookup + */ + struct page **bio_pages; +}; + +static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); +static noinline void finish_rmw(struct btrfs_raid_bio *rbio); +static void rmw_work(struct btrfs_work *work); +static void read_rebuild_work(struct btrfs_work *work); +static void async_rmw_stripe(struct btrfs_raid_bio *rbio); +static void async_read_rebuild(struct btrfs_raid_bio *rbio); +static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); +static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); +static void __free_raid_bio(struct btrfs_raid_bio *rbio); +static void index_rbio_pages(struct btrfs_raid_bio *rbio); +static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); + +/* + * the stripe hash table is used for locking, and to collect + * bios in hopes of making a full stripe + */ +int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) +{ + struct btrfs_stripe_hash_table *table; + struct btrfs_stripe_hash_table *x; + struct btrfs_stripe_hash *cur; + struct btrfs_stripe_hash *h; + int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; + int i; + int table_size; + + if (info->stripe_hash_table) + return 0; + + /* + * The table is large, starting with order 4 and can go as high as + * order 7 in case lock debugging is turned on. + * + * Try harder to allocate and fallback to vmalloc to lower the chance + * of a failing mount. + */ + table_size = sizeof(*table) + sizeof(*h) * num_entries; + table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + if (!table) { + table = vzalloc(table_size); + if (!table) + return -ENOMEM; + } + + spin_lock_init(&table->cache_lock); + INIT_LIST_HEAD(&table->stripe_cache); + + h = table->table; + + for (i = 0; i < num_entries; i++) { + cur = h + i; + INIT_LIST_HEAD(&cur->hash_list); + spin_lock_init(&cur->lock); + init_waitqueue_head(&cur->wait); + } + + x = cmpxchg(&info->stripe_hash_table, NULL, table); + if (x) { + if (is_vmalloc_addr(x)) + vfree(x); + else + kfree(x); + } + return 0; +} + +/* + * caching an rbio means to copy anything from the + * bio_pages array into the stripe_pages array. We + * use the page uptodate bit in the stripe cache array + * to indicate if it has valid data + * + * once the caching is done, we set the cache ready + * bit. + */ +static void cache_rbio_pages(struct btrfs_raid_bio *rbio) +{ + int i; + char *s; + char *d; + int ret; + + ret = alloc_rbio_pages(rbio); + if (ret) + return; + + for (i = 0; i < rbio->nr_pages; i++) { + if (!rbio->bio_pages[i]) + continue; + + s = kmap(rbio->bio_pages[i]); + d = kmap(rbio->stripe_pages[i]); + + memcpy(d, s, PAGE_CACHE_SIZE); + + kunmap(rbio->bio_pages[i]); + kunmap(rbio->stripe_pages[i]); + SetPageUptodate(rbio->stripe_pages[i]); + } + set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); +} + +/* + * we hash on the first logical address of the stripe + */ +static int rbio_bucket(struct btrfs_raid_bio *rbio) +{ + u64 num = rbio->raid_map[0]; + + /* + * we shift down quite a bit. We're using byte + * addressing, and most of the lower bits are zeros. + * This tends to upset hash_64, and it consistently + * returns just one or two different values. + * + * shifting off the lower bits fixes things. + */ + return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); +} + +/* + * stealing an rbio means taking all the uptodate pages from the stripe + * array in the source rbio and putting them into the destination rbio + */ +static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) +{ + int i; + struct page *s; + struct page *d; + + if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) + return; + + for (i = 0; i < dest->nr_pages; i++) { + s = src->stripe_pages[i]; + if (!s || !PageUptodate(s)) { + continue; + } + + d = dest->stripe_pages[i]; + if (d) + __free_page(d); + + dest->stripe_pages[i] = s; + src->stripe_pages[i] = NULL; + } +} + +/* + * merging means we take the bio_list from the victim and + * splice it into the destination. The victim should + * be discarded afterwards. + * + * must be called with dest->rbio_list_lock held + */ +static void merge_rbio(struct btrfs_raid_bio *dest, + struct btrfs_raid_bio *victim) +{ + bio_list_merge(&dest->bio_list, &victim->bio_list); + dest->bio_list_bytes += victim->bio_list_bytes; + bio_list_init(&victim->bio_list); +} + +/* + * used to prune items that are in the cache. The caller + * must hold the hash table lock. + */ +static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) +{ + int bucket = rbio_bucket(rbio); + struct btrfs_stripe_hash_table *table; + struct btrfs_stripe_hash *h; + int freeit = 0; + + /* + * check the bit again under the hash table lock. + */ + if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) + return; + + table = rbio->fs_info->stripe_hash_table; + h = table->table + bucket; + + /* hold the lock for the bucket because we may be + * removing it from the hash table + */ + spin_lock(&h->lock); + + /* + * hold the lock for the bio list because we need + * to make sure the bio list is empty + */ + spin_lock(&rbio->bio_list_lock); + + if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) { + list_del_init(&rbio->stripe_cache); + table->cache_size -= 1; + freeit = 1; + + /* if the bio list isn't empty, this rbio is + * still involved in an IO. We take it out + * of the cache list, and drop the ref that + * was held for the list. + * + * If the bio_list was empty, we also remove + * the rbio from the hash_table, and drop + * the corresponding ref + */ + if (bio_list_empty(&rbio->bio_list)) { + if (!list_empty(&rbio->hash_list)) { + list_del_init(&rbio->hash_list); + atomic_dec(&rbio->refs); + BUG_ON(!list_empty(&rbio->plug_list)); + } + } + } + + spin_unlock(&rbio->bio_list_lock); + spin_unlock(&h->lock); + + if (freeit) + __free_raid_bio(rbio); +} + +/* + * prune a given rbio from the cache + */ +static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) +{ + struct btrfs_stripe_hash_table *table; + unsigned long flags; + + if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) + return; + + table = rbio->fs_info->stripe_hash_table; + + spin_lock_irqsave(&table->cache_lock, flags); + __remove_rbio_from_cache(rbio); + spin_unlock_irqrestore(&table->cache_lock, flags); +} + +/* + * remove everything in the cache + */ +void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) +{ + struct btrfs_stripe_hash_table *table; + unsigned long flags; + struct btrfs_raid_bio *rbio; + + table = info->stripe_hash_table; + + spin_lock_irqsave(&table->cache_lock, flags); + while (!list_empty(&table->stripe_cache)) { + rbio = list_entry(table->stripe_cache.next, + struct btrfs_raid_bio, + stripe_cache); + __remove_rbio_from_cache(rbio); + } + spin_unlock_irqrestore(&table->cache_lock, flags); +} + +/* + * remove all cached entries and free the hash table + * used by unmount + */ +void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) +{ + if (!info->stripe_hash_table) + return; + btrfs_clear_rbio_cache(info); + if (is_vmalloc_addr(info->stripe_hash_table)) + vfree(info->stripe_hash_table); + else + kfree(info->stripe_hash_table); + info->stripe_hash_table = NULL; +} + +/* + * insert an rbio into the stripe cache. It + * must have already been prepared by calling + * cache_rbio_pages + * + * If this rbio was already cached, it gets + * moved to the front of the lru. + * + * If the size of the rbio cache is too big, we + * prune an item. + */ +static void cache_rbio(struct btrfs_raid_bio *rbio) +{ + struct btrfs_stripe_hash_table *table; + unsigned long flags; + + if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) + return; + + table = rbio->fs_info->stripe_hash_table; + + spin_lock_irqsave(&table->cache_lock, flags); + spin_lock(&rbio->bio_list_lock); + + /* bump our ref if we were not in the list before */ + if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) + atomic_inc(&rbio->refs); + + if (!list_empty(&rbio->stripe_cache)){ + list_move(&rbio->stripe_cache, &table->stripe_cache); + } else { + list_add(&rbio->stripe_cache, &table->stripe_cache); + table->cache_size += 1; + } + + spin_unlock(&rbio->bio_list_lock); + + if (table->cache_size > RBIO_CACHE_SIZE) { + struct btrfs_raid_bio *found; + + found = list_entry(table->stripe_cache.prev, + struct btrfs_raid_bio, + stripe_cache); + + if (found != rbio) + __remove_rbio_from_cache(found); + } + + spin_unlock_irqrestore(&table->cache_lock, flags); + return; +} + +/* + * helper function to run the xor_blocks api. It is only + * able to do MAX_XOR_BLOCKS at a time, so we need to + * loop through. + */ +static void run_xor(void **pages, int src_cnt, ssize_t len) +{ + int src_off = 0; + int xor_src_cnt = 0; + void *dest = pages[src_cnt]; + + while(src_cnt > 0) { + xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS); + xor_blocks(xor_src_cnt, len, dest, pages + src_off); + + src_cnt -= xor_src_cnt; + src_off += xor_src_cnt; + } +} + +/* + * returns true if the bio list inside this rbio + * covers an entire stripe (no rmw required). + * Must be called with the bio list lock held, or + * at a time when you know it is impossible to add + * new bios into the list + */ +static int __rbio_is_full(struct btrfs_raid_bio *rbio) +{ + unsigned long size = rbio->bio_list_bytes; + int ret = 1; + + if (size != rbio->nr_data * rbio->stripe_len) + ret = 0; + + BUG_ON(size > rbio->nr_data * rbio->stripe_len); + return ret; +} + +static int rbio_is_full(struct btrfs_raid_bio *rbio) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&rbio->bio_list_lock, flags); + ret = __rbio_is_full(rbio); + spin_unlock_irqrestore(&rbio->bio_list_lock, flags); + return ret; +} + +/* + * returns 1 if it is safe to merge two rbios together. + * The merging is safe if the two rbios correspond to + * the same stripe and if they are both going in the same + * direction (read vs write), and if neither one is + * locked for final IO + * + * The caller is responsible for locking such that + * rmw_locked is safe to test + */ +static int rbio_can_merge(struct btrfs_raid_bio *last, + struct btrfs_raid_bio *cur) +{ + if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) || + test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) + return 0; + + /* + * we can't merge with cached rbios, since the + * idea is that when we merge the destination + * rbio is going to run our IO for us. We can + * steal from cached rbio's though, other functions + * handle that. + */ + if (test_bit(RBIO_CACHE_BIT, &last->flags) || + test_bit(RBIO_CACHE_BIT, &cur->flags)) + return 0; + + if (last->raid_map[0] != + cur->raid_map[0]) + return 0; + + /* reads can't merge with writes */ + if (last->read_rebuild != + cur->read_rebuild) { + return 0; + } + + return 1; +} + +/* + * helper to index into the pstripe + */ +static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) +{ + index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; + return rbio->stripe_pages[index]; +} + +/* + * helper to index into the qstripe, returns null + * if there is no qstripe + */ +static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) +{ + if (rbio->nr_data + 1 == rbio->bbio->num_stripes) + return NULL; + + index += ((rbio->nr_data + 1) * rbio->stripe_len) >> + PAGE_CACHE_SHIFT; + return rbio->stripe_pages[index]; +} + +/* + * The first stripe in the table for a logical address + * has the lock. rbios are added in one of three ways: + * + * 1) Nobody has the stripe locked yet. The rbio is given + * the lock and 0 is returned. The caller must start the IO + * themselves. + * + * 2) Someone has the stripe locked, but we're able to merge + * with the lock owner. The rbio is freed and the IO will + * start automatically along with the existing rbio. 1 is returned. + * + * 3) Someone has the stripe locked, but we're not able to merge. + * The rbio is added to the lock owner's plug list, or merged into + * an rbio already on the plug list. When the lock owner unlocks, + * the next rbio on the list is run and the IO is started automatically. + * 1 is returned + * + * If we return 0, the caller still owns the rbio and must continue with + * IO submission. If we return 1, the caller must assume the rbio has + * already been freed. + */ +static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) +{ + int bucket = rbio_bucket(rbio); + struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket; + struct btrfs_raid_bio *cur; + struct btrfs_raid_bio *pending; + unsigned long flags; + DEFINE_WAIT(wait); + struct btrfs_raid_bio *freeit = NULL; + struct btrfs_raid_bio *cache_drop = NULL; + int ret = 0; + int walk = 0; + + spin_lock_irqsave(&h->lock, flags); + list_for_each_entry(cur, &h->hash_list, hash_list) { + walk++; + if (cur->raid_map[0] == rbio->raid_map[0]) { + spin_lock(&cur->bio_list_lock); + + /* can we steal this cached rbio's pages? */ + if (bio_list_empty(&cur->bio_list) && + list_empty(&cur->plug_list) && + test_bit(RBIO_CACHE_BIT, &cur->flags) && + !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { + list_del_init(&cur->hash_list); + atomic_dec(&cur->refs); + + steal_rbio(cur, rbio); + cache_drop = cur; + spin_unlock(&cur->bio_list_lock); + + goto lockit; + } + + /* can we merge into the lock owner? */ + if (rbio_can_merge(cur, rbio)) { + merge_rbio(cur, rbio); + spin_unlock(&cur->bio_list_lock); + freeit = rbio; + ret = 1; + goto out; + } + + + /* + * we couldn't merge with the running + * rbio, see if we can merge with the + * pending ones. We don't have to + * check for rmw_locked because there + * is no way they are inside finish_rmw + * right now + */ + list_for_each_entry(pending, &cur->plug_list, + plug_list) { + if (rbio_can_merge(pending, rbio)) { + merge_rbio(pending, rbio); + spin_unlock(&cur->bio_list_lock); + freeit = rbio; + ret = 1; + goto out; + } + } + + /* no merging, put us on the tail of the plug list, + * our rbio will be started with the currently + * running rbio unlocks + */ + list_add_tail(&rbio->plug_list, &cur->plug_list); + spin_unlock(&cur->bio_list_lock); + ret = 1; + goto out; + } + } +lockit: + atomic_inc(&rbio->refs); + list_add(&rbio->hash_list, &h->hash_list); +out: + spin_unlock_irqrestore(&h->lock, flags); + if (cache_drop) + remove_rbio_from_cache(cache_drop); + if (freeit) + __free_raid_bio(freeit); + return ret; +} + +/* + * called as rmw or parity rebuild is completed. If the plug list has more + * rbios waiting for this stripe, the next one on the list will be started + */ +static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) +{ + int bucket; + struct btrfs_stripe_hash *h; + unsigned long flags; + int keep_cache = 0; + + bucket = rbio_bucket(rbio); + h = rbio->fs_info->stripe_hash_table->table + bucket; + + if (list_empty(&rbio->plug_list)) + cache_rbio(rbio); + + spin_lock_irqsave(&h->lock, flags); + spin_lock(&rbio->bio_list_lock); + + if (!list_empty(&rbio->hash_list)) { + /* + * if we're still cached and there is no other IO + * to perform, just leave this rbio here for others + * to steal from later + */ + if (list_empty(&rbio->plug_list) && + test_bit(RBIO_CACHE_BIT, &rbio->flags)) { + keep_cache = 1; + clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + BUG_ON(!bio_list_empty(&rbio->bio_list)); + goto done; + } + + list_del_init(&rbio->hash_list); + atomic_dec(&rbio->refs); + + /* + * we use the plug list to hold all the rbios + * waiting for the chance to lock this stripe. + * hand the lock over to one of them. + */ + if (!list_empty(&rbio->plug_list)) { + struct btrfs_raid_bio *next; + struct list_head *head = rbio->plug_list.next; + + next = list_entry(head, struct btrfs_raid_bio, + plug_list); + + list_del_init(&rbio->plug_list); + + list_add(&next->hash_list, &h->hash_list); + atomic_inc(&next->refs); + spin_unlock(&rbio->bio_list_lock); + spin_unlock_irqrestore(&h->lock, flags); + + if (next->read_rebuild) + async_read_rebuild(next); + else { + steal_rbio(rbio, next); + async_rmw_stripe(next); + } + + goto done_nolock; + } else if (waitqueue_active(&h->wait)) { + spin_unlock(&rbio->bio_list_lock); + spin_unlock_irqrestore(&h->lock, flags); + wake_up(&h->wait); + goto done_nolock; + } + } +done: + spin_unlock(&rbio->bio_list_lock); + spin_unlock_irqrestore(&h->lock, flags); + +done_nolock: + if (!keep_cache) + remove_rbio_from_cache(rbio); +} + +static void __free_raid_bio(struct btrfs_raid_bio *rbio) +{ + int i; + + WARN_ON(atomic_read(&rbio->refs) < 0); + if (!atomic_dec_and_test(&rbio->refs)) + return; + + WARN_ON(!list_empty(&rbio->stripe_cache)); + WARN_ON(!list_empty(&rbio->hash_list)); + WARN_ON(!bio_list_empty(&rbio->bio_list)); + + for (i = 0; i < rbio->nr_pages; i++) { + if (rbio->stripe_pages[i]) { + __free_page(rbio->stripe_pages[i]); + rbio->stripe_pages[i] = NULL; + } + } + kfree(rbio->raid_map); + kfree(rbio->bbio); + kfree(rbio); +} + +static void free_raid_bio(struct btrfs_raid_bio *rbio) +{ + unlock_stripe(rbio); + __free_raid_bio(rbio); +} + +/* + * this frees the rbio and runs through all the bios in the + * bio_list and calls end_io on them + */ +static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) +{ + struct bio *cur = bio_list_get(&rbio->bio_list); + struct bio *next; + free_raid_bio(rbio); + + while (cur) { + next = cur->bi_next; + cur->bi_next = NULL; + if (uptodate) + set_bit(BIO_UPTODATE, &cur->bi_flags); + bio_endio(cur, err); + cur = next; + } +} + +/* + * end io function used by finish_rmw. When we finally + * get here, we've written a full stripe + */ +static void raid_write_end_io(struct bio *bio, int err) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + + if (err) + fail_bio_stripe(rbio, bio); + + bio_put(bio); + + if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) + return; + + err = 0; + + /* OK, we have read all the stripes we need to. */ + if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) + err = -EIO; + + rbio_orig_end_io(rbio, err, 0); + return; +} + +/* + * the read/modify/write code wants to use the original bio for + * any pages it included, and then use the rbio for everything + * else. This function decides if a given index (stripe number) + * and page number in that stripe fall inside the original bio + * or the rbio. + * + * if you set bio_list_only, you'll get a NULL back for any ranges + * that are outside the bio_list + * + * This doesn't take any refs on anything, you get a bare page pointer + * and the caller must bump refs as required. + * + * You must call index_rbio_pages once before you can trust + * the answers from this function. + */ +static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, + int index, int pagenr, int bio_list_only) +{ + int chunk_page; + struct page *p = NULL; + + chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr; + + spin_lock_irq(&rbio->bio_list_lock); + p = rbio->bio_pages[chunk_page]; + spin_unlock_irq(&rbio->bio_list_lock); + + if (p || bio_list_only) + return p; + + return rbio->stripe_pages[chunk_page]; +} + +/* + * number of pages we need for the entire stripe across all the + * drives + */ +static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) +{ + unsigned long nr = stripe_len * nr_stripes; + return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +} + +/* + * allocation and initial setup for the btrfs_raid_bio. Not + * this does not allocate any pages for rbio->pages. + */ +static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, + struct btrfs_bio *bbio, u64 *raid_map, + u64 stripe_len) +{ + struct btrfs_raid_bio *rbio; + int nr_data = 0; + int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes); + void *p; + + rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2, + GFP_NOFS); + if (!rbio) { + kfree(raid_map); + kfree(bbio); + return ERR_PTR(-ENOMEM); + } + + bio_list_init(&rbio->bio_list); + INIT_LIST_HEAD(&rbio->plug_list); + spin_lock_init(&rbio->bio_list_lock); + INIT_LIST_HEAD(&rbio->stripe_cache); + INIT_LIST_HEAD(&rbio->hash_list); + rbio->bbio = bbio; + rbio->raid_map = raid_map; + rbio->fs_info = root->fs_info; + rbio->stripe_len = stripe_len; + rbio->nr_pages = num_pages; + rbio->faila = -1; + rbio->failb = -1; + atomic_set(&rbio->refs, 1); + + /* + * the stripe_pages and bio_pages array point to the extra + * memory we allocated past the end of the rbio + */ + p = rbio + 1; + rbio->stripe_pages = p; + rbio->bio_pages = p + sizeof(struct page *) * num_pages; + + if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) + nr_data = bbio->num_stripes - 2; + else + nr_data = bbio->num_stripes - 1; + + rbio->nr_data = nr_data; + return rbio; +} + +/* allocate pages for all the stripes in the bio, including parity */ +static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) +{ + int i; + struct page *page; + + for (i = 0; i < rbio->nr_pages; i++) { + if (rbio->stripe_pages[i]) + continue; + page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!page) + return -ENOMEM; + rbio->stripe_pages[i] = page; + ClearPageUptodate(page); + } + return 0; +} + +/* allocate pages for just the p/q stripes */ +static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) +{ + int i; + struct page *page; + + i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; + + for (; i < rbio->nr_pages; i++) { + if (rbio->stripe_pages[i]) + continue; + page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!page) + return -ENOMEM; + rbio->stripe_pages[i] = page; + } + return 0; +} + +/* + * add a single page from a specific stripe into our list of bios for IO + * this will try to merge into existing bios if possible, and returns + * zero if all went well. + */ +int rbio_add_io_page(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list, + struct page *page, + int stripe_nr, + unsigned long page_index, + unsigned long bio_max_len) +{ + struct bio *last = bio_list->tail; + u64 last_end = 0; + int ret; + struct bio *bio; + struct btrfs_bio_stripe *stripe; + u64 disk_start; + + stripe = &rbio->bbio->stripes[stripe_nr]; + disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT); + + /* if the device is missing, just fail this stripe */ + if (!stripe->dev->bdev) + return fail_rbio_index(rbio, stripe_nr); + + /* see if we can add this page onto our existing bio */ + if (last) { + last_end = (u64)last->bi_sector << 9; + last_end += last->bi_size; + + /* + * we can't merge these if they are from different + * devices or if they are not contiguous + */ + if (last_end == disk_start && stripe->dev->bdev && + test_bit(BIO_UPTODATE, &last->bi_flags) && + last->bi_bdev == stripe->dev->bdev) { + ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0); + if (ret == PAGE_CACHE_SIZE) + return 0; + } + } + + /* put a new bio on the list */ + bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1); + if (!bio) + return -ENOMEM; + + bio->bi_size = 0; + bio->bi_bdev = stripe->dev->bdev; + bio->bi_sector = disk_start >> 9; + set_bit(BIO_UPTODATE, &bio->bi_flags); + + bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); + bio_list_add(bio_list, bio); + return 0; +} + +/* + * while we're doing the read/modify/write cycle, we could + * have errors in reading pages off the disk. This checks + * for errors and if we're not able to read the page it'll + * trigger parity reconstruction. The rmw will be finished + * after we've reconstructed the failed stripes + */ +static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) +{ + if (rbio->faila >= 0 || rbio->failb >= 0) { + BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1); + __raid56_parity_recover(rbio); + } else { + finish_rmw(rbio); + } +} + +/* + * these are just the pages from the rbio array, not from anything + * the FS sent down to us + */ +static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page) +{ + int index; + index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT); + index += page; + return rbio->stripe_pages[index]; +} + +/* + * helper function to walk our bio list and populate the bio_pages array with + * the result. This seems expensive, but it is faster than constantly + * searching through the bio list as we setup the IO in finish_rmw or stripe + * reconstruction. + * + * This must be called before you trust the answers from page_in_rbio + */ +static void index_rbio_pages(struct btrfs_raid_bio *rbio) +{ + struct bio *bio; + u64 start; + unsigned long stripe_offset; + unsigned long page_index; + struct page *p; + int i; + + spin_lock_irq(&rbio->bio_list_lock); + bio_list_for_each(bio, &rbio->bio_list) { + start = (u64)bio->bi_sector << 9; + stripe_offset = start - rbio->raid_map[0]; + page_index = stripe_offset >> PAGE_CACHE_SHIFT; + + for (i = 0; i < bio->bi_vcnt; i++) { + p = bio->bi_io_vec[i].bv_page; + rbio->bio_pages[page_index + i] = p; + } + } + spin_unlock_irq(&rbio->bio_list_lock); +} + +/* + * this is called from one of two situations. We either + * have a full stripe from the higher layers, or we've read all + * the missing bits off disk. + * + * This will calculate the parity and then send down any + * changed blocks. + */ +static noinline void finish_rmw(struct btrfs_raid_bio *rbio) +{ + struct btrfs_bio *bbio = rbio->bbio; + void *pointers[bbio->num_stripes]; + int stripe_len = rbio->stripe_len; + int nr_data = rbio->nr_data; + int stripe; + int pagenr; + int p_stripe = -1; + int q_stripe = -1; + struct bio_list bio_list; + struct bio *bio; + int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT; + int ret; + + bio_list_init(&bio_list); + + if (bbio->num_stripes - rbio->nr_data == 1) { + p_stripe = bbio->num_stripes - 1; + } else if (bbio->num_stripes - rbio->nr_data == 2) { + p_stripe = bbio->num_stripes - 2; + q_stripe = bbio->num_stripes - 1; + } else { + BUG(); + } + + /* at this point we either have a full stripe, + * or we've read the full stripe from the drive. + * recalculate the parity and write the new results. + * + * We're not allowed to add any new bios to the + * bio list here, anyone else that wants to + * change this stripe needs to do their own rmw. + */ + spin_lock_irq(&rbio->bio_list_lock); + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + spin_unlock_irq(&rbio->bio_list_lock); + + atomic_set(&rbio->bbio->error, 0); + + /* + * now that we've set rmw_locked, run through the + * bio list one last time and map the page pointers + * + * We don't cache full rbios because we're assuming + * the higher layers are unlikely to use this area of + * the disk again soon. If they do use it again, + * hopefully they will send another full bio. + */ + index_rbio_pages(rbio); + if (!rbio_is_full(rbio)) + cache_rbio_pages(rbio); + else + clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + + for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + struct page *p; + /* first collect one page from each data stripe */ + for (stripe = 0; stripe < nr_data; stripe++) { + p = page_in_rbio(rbio, stripe, pagenr, 0); + pointers[stripe] = kmap(p); + } + + /* then add the parity stripe */ + p = rbio_pstripe_page(rbio, pagenr); + SetPageUptodate(p); + pointers[stripe++] = kmap(p); + + if (q_stripe != -1) { + + /* + * raid6, add the qstripe and call the + * library function to fill in our p/q + */ + p = rbio_qstripe_page(rbio, pagenr); + SetPageUptodate(p); + pointers[stripe++] = kmap(p); + + raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE, + pointers); + } else { + /* raid5 */ + memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); + run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); + } + + + for (stripe = 0; stripe < bbio->num_stripes; stripe++) + kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); + } + + /* + * time to start writing. Make bios for everything from the + * higher layers (the bio_list in our rbio) and our p/q. Ignore + * everything else. + */ + for (stripe = 0; stripe < bbio->num_stripes; stripe++) { + for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + struct page *page; + if (stripe < rbio->nr_data) { + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (!page) + continue; + } else { + page = rbio_stripe_page(rbio, stripe, pagenr); + } + + ret = rbio_add_io_page(rbio, &bio_list, + page, stripe, pagenr, rbio->stripe_len); + if (ret) + goto cleanup; + } + } + + atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list)); + BUG_ON(atomic_read(&bbio->stripes_pending) == 0); + + while (1) { + bio = bio_list_pop(&bio_list); + if (!bio) + break; + + bio->bi_private = rbio; + bio->bi_end_io = raid_write_end_io; + BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); + submit_bio(WRITE, bio); + } + return; + +cleanup: + rbio_orig_end_io(rbio, -EIO, 0); +} + +/* + * helper to find the stripe number for a given bio. Used to figure out which + * stripe has failed. This expects the bio to correspond to a physical disk, + * so it looks up based on physical sector numbers. + */ +static int find_bio_stripe(struct btrfs_raid_bio *rbio, + struct bio *bio) +{ + u64 physical = bio->bi_sector; + u64 stripe_start; + int i; + struct btrfs_bio_stripe *stripe; + + physical <<= 9; + + for (i = 0; i < rbio->bbio->num_stripes; i++) { + stripe = &rbio->bbio->stripes[i]; + stripe_start = stripe->physical; + if (physical >= stripe_start && + physical < stripe_start + rbio->stripe_len) { + return i; + } + } + return -1; +} + +/* + * helper to find the stripe number for a given + * bio (before mapping). Used to figure out which stripe has + * failed. This looks up based on logical block numbers. + */ +static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, + struct bio *bio) +{ + u64 logical = bio->bi_sector; + u64 stripe_start; + int i; + + logical <<= 9; + + for (i = 0; i < rbio->nr_data; i++) { + stripe_start = rbio->raid_map[i]; + if (logical >= stripe_start && + logical < stripe_start + rbio->stripe_len) { + return i; + } + } + return -1; +} + +/* + * returns -EIO if we had too many failures + */ +static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&rbio->bio_list_lock, flags); + + /* we already know this stripe is bad, move on */ + if (rbio->faila == failed || rbio->failb == failed) + goto out; + + if (rbio->faila == -1) { + /* first failure on this rbio */ + rbio->faila = failed; + atomic_inc(&rbio->bbio->error); + } else if (rbio->failb == -1) { + /* second failure on this rbio */ + rbio->failb = failed; + atomic_inc(&rbio->bbio->error); + } else { + ret = -EIO; + } +out: + spin_unlock_irqrestore(&rbio->bio_list_lock, flags); + + return ret; +} + +/* + * helper to fail a stripe based on a physical disk + * bio. + */ +static int fail_bio_stripe(struct btrfs_raid_bio *rbio, + struct bio *bio) +{ + int failed = find_bio_stripe(rbio, bio); + + if (failed < 0) + return -EIO; + + return fail_rbio_index(rbio, failed); +} + +/* + * this sets each page in the bio uptodate. It should only be used on private + * rbio pages, nothing that comes in from the higher layers + */ +static void set_bio_pages_uptodate(struct bio *bio) +{ + int i; + struct page *p; + + for (i = 0; i < bio->bi_vcnt; i++) { + p = bio->bi_io_vec[i].bv_page; + SetPageUptodate(p); + } +} + +/* + * end io for the read phase of the rmw cycle. All the bios here are physical + * stripe bios we've read from the disk so we can recalculate the parity of the + * stripe. + * + * This will usually kick off finish_rmw once all the bios are read in, but it + * may trigger parity reconstruction if we had any errors along the way + */ +static void raid_rmw_end_io(struct bio *bio, int err) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + + if (err) + fail_bio_stripe(rbio, bio); + else + set_bio_pages_uptodate(bio); + + bio_put(bio); + + if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) + return; + + err = 0; + if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) + goto cleanup; + + /* + * this will normally call finish_rmw to start our write + * but if there are any failed stripes we'll reconstruct + * from parity first + */ + validate_rbio_for_rmw(rbio); + return; + +cleanup: + + rbio_orig_end_io(rbio, -EIO, 0); +} + +static void async_rmw_stripe(struct btrfs_raid_bio *rbio) +{ + rbio->work.flags = 0; + rbio->work.func = rmw_work; + + btrfs_queue_worker(&rbio->fs_info->rmw_workers, + &rbio->work); +} + +static void async_read_rebuild(struct btrfs_raid_bio *rbio) +{ + rbio->work.flags = 0; + rbio->work.func = read_rebuild_work; + + btrfs_queue_worker(&rbio->fs_info->rmw_workers, + &rbio->work); +} + +/* + * the stripe must be locked by the caller. It will + * unlock after all the writes are done + */ +static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) +{ + int bios_to_read = 0; + struct btrfs_bio *bbio = rbio->bbio; + struct bio_list bio_list; + int ret; + int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + int pagenr; + int stripe; + struct bio *bio; + + bio_list_init(&bio_list); + + ret = alloc_rbio_pages(rbio); + if (ret) + goto cleanup; + + index_rbio_pages(rbio); + + atomic_set(&rbio->bbio->error, 0); + /* + * build a list of bios to read all the missing parts of this + * stripe + */ + for (stripe = 0; stripe < rbio->nr_data; stripe++) { + for (pagenr = 0; pagenr < nr_pages; pagenr++) { + struct page *page; + /* + * we want to find all the pages missing from + * the rbio and read them from the disk. If + * page_in_rbio finds a page in the bio list + * we don't need to read it off the stripe. + */ + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (page) + continue; + + page = rbio_stripe_page(rbio, stripe, pagenr); + /* + * the bio cache may have handed us an uptodate + * page. If so, be happy and use it + */ + if (PageUptodate(page)) + continue; + + ret = rbio_add_io_page(rbio, &bio_list, page, + stripe, pagenr, rbio->stripe_len); + if (ret) + goto cleanup; + } + } + + bios_to_read = bio_list_size(&bio_list); + if (!bios_to_read) { + /* + * this can happen if others have merged with + * us, it means there is nothing left to read. + * But if there are missing devices it may not be + * safe to do the full stripe write yet. + */ + goto finish; + } + + /* + * the bbio may be freed once we submit the last bio. Make sure + * not to touch it after that + */ + atomic_set(&bbio->stripes_pending, bios_to_read); + while (1) { + bio = bio_list_pop(&bio_list); + if (!bio) + break; + + bio->bi_private = rbio; + bio->bi_end_io = raid_rmw_end_io; + + btrfs_bio_wq_end_io(rbio->fs_info, bio, + BTRFS_WQ_ENDIO_RAID56); + + BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); + submit_bio(READ, bio); + } + /* the actual write will happen once the reads are done */ + return 0; + +cleanup: + rbio_orig_end_io(rbio, -EIO, 0); + return -EIO; + +finish: + validate_rbio_for_rmw(rbio); + return 0; +} + +/* + * if the upper layers pass in a full stripe, we thank them by only allocating + * enough pages to hold the parity, and sending it all down quickly. + */ +static int full_stripe_write(struct btrfs_raid_bio *rbio) +{ + int ret; + + ret = alloc_rbio_parity_pages(rbio); + if (ret) + return ret; + + ret = lock_stripe_add(rbio); + if (ret == 0) + finish_rmw(rbio); + return 0; +} + +/* + * partial stripe writes get handed over to async helpers. + * We're really hoping to merge a few more writes into this + * rbio before calculating new parity + */ +static int partial_stripe_write(struct btrfs_raid_bio *rbio) +{ + int ret; + + ret = lock_stripe_add(rbio); + if (ret == 0) + async_rmw_stripe(rbio); + return 0; +} + +/* + * sometimes while we were reading from the drive to + * recalculate parity, enough new bios come into create + * a full stripe. So we do a check here to see if we can + * go directly to finish_rmw + */ +static int __raid56_parity_write(struct btrfs_raid_bio *rbio) +{ + /* head off into rmw land if we don't have a full stripe */ + if (!rbio_is_full(rbio)) + return partial_stripe_write(rbio); + return full_stripe_write(rbio); +} + +/* + * We use plugging call backs to collect full stripes. + * Any time we get a partial stripe write while plugged + * we collect it into a list. When the unplug comes down, + * we sort the list by logical block number and merge + * everything we can into the same rbios + */ +struct btrfs_plug_cb { + struct blk_plug_cb cb; + struct btrfs_fs_info *info; + struct list_head rbio_list; + struct btrfs_work work; +}; + +/* + * rbios on the plug list are sorted for easier merging. + */ +static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, + plug_list); + struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, + plug_list); + u64 a_sector = ra->bio_list.head->bi_sector; + u64 b_sector = rb->bio_list.head->bi_sector; + + if (a_sector < b_sector) + return -1; + if (a_sector > b_sector) + return 1; + return 0; +} + +static void run_plug(struct btrfs_plug_cb *plug) +{ + struct btrfs_raid_bio *cur; + struct btrfs_raid_bio *last = NULL; + + /* + * sort our plug list then try to merge + * everything we can in hopes of creating full + * stripes. + */ + list_sort(NULL, &plug->rbio_list, plug_cmp); + while (!list_empty(&plug->rbio_list)) { + cur = list_entry(plug->rbio_list.next, + struct btrfs_raid_bio, plug_list); + list_del_init(&cur->plug_list); + + if (rbio_is_full(cur)) { + /* we have a full stripe, send it down */ + full_stripe_write(cur); + continue; + } + if (last) { + if (rbio_can_merge(last, cur)) { + merge_rbio(last, cur); + __free_raid_bio(cur); + continue; + + } + __raid56_parity_write(last); + } + last = cur; + } + if (last) { + __raid56_parity_write(last); + } + kfree(plug); +} + +/* + * if the unplug comes from schedule, we have to push the + * work off to a helper thread + */ +static void unplug_work(struct btrfs_work *work) +{ + struct btrfs_plug_cb *plug; + plug = container_of(work, struct btrfs_plug_cb, work); + run_plug(plug); +} + +static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) +{ + struct btrfs_plug_cb *plug; + plug = container_of(cb, struct btrfs_plug_cb, cb); + + if (from_schedule) { + plug->work.flags = 0; + plug->work.func = unplug_work; + btrfs_queue_worker(&plug->info->rmw_workers, + &plug->work); + return; + } + run_plug(plug); +} + +/* + * our main entry point for writes from the rest of the FS. + */ +int raid56_parity_write(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 *raid_map, + u64 stripe_len) +{ + struct btrfs_raid_bio *rbio; + struct btrfs_plug_cb *plug = NULL; + struct blk_plug_cb *cb; + + rbio = alloc_rbio(root, bbio, raid_map, stripe_len); + if (IS_ERR(rbio)) { + kfree(raid_map); + kfree(bbio); + return PTR_ERR(rbio); + } + bio_list_add(&rbio->bio_list, bio); + rbio->bio_list_bytes = bio->bi_size; + + /* + * don't plug on full rbios, just get them out the door + * as quickly as we can + */ + if (rbio_is_full(rbio)) + return full_stripe_write(rbio); + + cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, + sizeof(*plug)); + if (cb) { + plug = container_of(cb, struct btrfs_plug_cb, cb); + if (!plug->info) { + plug->info = root->fs_info; + INIT_LIST_HEAD(&plug->rbio_list); + } + list_add_tail(&rbio->plug_list, &plug->rbio_list); + } else { + return __raid56_parity_write(rbio); + } + return 0; +} + +/* + * all parity reconstruction happens here. We've read in everything + * we can find from the drives and this does the heavy lifting of + * sorting the good from the bad. + */ +static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) +{ + int pagenr, stripe; + void **pointers; + int faila = -1, failb = -1; + int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + struct page *page; + int err; + int i; + + pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *), + GFP_NOFS); + if (!pointers) { + err = -ENOMEM; + goto cleanup_io; + } + + faila = rbio->faila; + failb = rbio->failb; + + if (rbio->read_rebuild) { + spin_lock_irq(&rbio->bio_list_lock); + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + spin_unlock_irq(&rbio->bio_list_lock); + } + + index_rbio_pages(rbio); + + for (pagenr = 0; pagenr < nr_pages; pagenr++) { + /* setup our array of pointers with pages + * from each stripe + */ + for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { + /* + * if we're rebuilding a read, we have to use + * pages from the bio list + */ + if (rbio->read_rebuild && + (stripe == faila || stripe == failb)) { + page = page_in_rbio(rbio, stripe, pagenr, 0); + } else { + page = rbio_stripe_page(rbio, stripe, pagenr); + } + pointers[stripe] = kmap(page); + } + + /* all raid6 handling here */ + if (rbio->raid_map[rbio->bbio->num_stripes - 1] == + RAID6_Q_STRIPE) { + + /* + * single failure, rebuild from parity raid5 + * style + */ + if (failb < 0) { + if (faila == rbio->nr_data) { + /* + * Just the P stripe has failed, without + * a bad data or Q stripe. + * TODO, we should redo the xor here. + */ + err = -EIO; + goto cleanup; + } + /* + * a single failure in raid6 is rebuilt + * in the pstripe code below + */ + goto pstripe; + } + + /* make sure our ps and qs are in order */ + if (faila > failb) { + int tmp = failb; + failb = faila; + faila = tmp; + } + + /* if the q stripe is failed, do a pstripe reconstruction + * from the xors. + * If both the q stripe and the P stripe are failed, we're + * here due to a crc mismatch and we can't give them the + * data they want + */ + if (rbio->raid_map[failb] == RAID6_Q_STRIPE) { + if (rbio->raid_map[faila] == RAID5_P_STRIPE) { + err = -EIO; + goto cleanup; + } + /* + * otherwise we have one bad data stripe and + * a good P stripe. raid5! + */ + goto pstripe; + } + + if (rbio->raid_map[failb] == RAID5_P_STRIPE) { + raid6_datap_recov(rbio->bbio->num_stripes, + PAGE_SIZE, faila, pointers); + } else { + raid6_2data_recov(rbio->bbio->num_stripes, + PAGE_SIZE, faila, failb, + pointers); + } + } else { + void *p; + + /* rebuild from P stripe here (raid5 or raid6) */ + BUG_ON(failb != -1); +pstripe: + /* Copy parity block into failed block to start with */ + memcpy(pointers[faila], + pointers[rbio->nr_data], + PAGE_CACHE_SIZE); + + /* rearrange the pointer array */ + p = pointers[faila]; + for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) + pointers[stripe] = pointers[stripe + 1]; + pointers[rbio->nr_data - 1] = p; + + /* xor in the rest */ + run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE); + } + /* if we're doing this rebuild as part of an rmw, go through + * and set all of our private rbio pages in the + * failed stripes as uptodate. This way finish_rmw will + * know they can be trusted. If this was a read reconstruction, + * other endio functions will fiddle the uptodate bits + */ + if (!rbio->read_rebuild) { + for (i = 0; i < nr_pages; i++) { + if (faila != -1) { + page = rbio_stripe_page(rbio, faila, i); + SetPageUptodate(page); + } + if (failb != -1) { + page = rbio_stripe_page(rbio, failb, i); + SetPageUptodate(page); + } + } + } + for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { + /* + * if we're rebuilding a read, we have to use + * pages from the bio list + */ + if (rbio->read_rebuild && + (stripe == faila || stripe == failb)) { + page = page_in_rbio(rbio, stripe, pagenr, 0); + } else { + page = rbio_stripe_page(rbio, stripe, pagenr); + } + kunmap(page); + } + } + + err = 0; +cleanup: + kfree(pointers); + +cleanup_io: + + if (rbio->read_rebuild) { + if (err == 0) + cache_rbio_pages(rbio); + else + clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + + rbio_orig_end_io(rbio, err, err == 0); + } else if (err == 0) { + rbio->faila = -1; + rbio->failb = -1; + finish_rmw(rbio); + } else { + rbio_orig_end_io(rbio, err, 0); + } +} + +/* + * This is called only for stripes we've read from disk to + * reconstruct the parity. + */ +static void raid_recover_end_io(struct bio *bio, int err) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + + /* + * we only read stripe pages off the disk, set them + * up to date if there were no errors + */ + if (err) + fail_bio_stripe(rbio, bio); + else + set_bio_pages_uptodate(bio); + bio_put(bio); + + if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) + return; + + if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) + rbio_orig_end_io(rbio, -EIO, 0); + else + __raid_recover_end_io(rbio); +} + +/* + * reads everything we need off the disk to reconstruct + * the parity. endio handlers trigger final reconstruction + * when the IO is done. + * + * This is used both for reads from the higher layers and for + * parity construction required to finish a rmw cycle. + */ +static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) +{ + int bios_to_read = 0; + struct btrfs_bio *bbio = rbio->bbio; + struct bio_list bio_list; + int ret; + int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + int pagenr; + int stripe; + struct bio *bio; + + bio_list_init(&bio_list); + + ret = alloc_rbio_pages(rbio); + if (ret) + goto cleanup; + + atomic_set(&rbio->bbio->error, 0); + + /* + * read everything that hasn't failed. Thanks to the + * stripe cache, it is possible that some or all of these + * pages are going to be uptodate. + */ + for (stripe = 0; stripe < bbio->num_stripes; stripe++) { + if (rbio->faila == stripe || + rbio->failb == stripe) + continue; + + for (pagenr = 0; pagenr < nr_pages; pagenr++) { + struct page *p; + + /* + * the rmw code may have already read this + * page in + */ + p = rbio_stripe_page(rbio, stripe, pagenr); + if (PageUptodate(p)) + continue; + + ret = rbio_add_io_page(rbio, &bio_list, + rbio_stripe_page(rbio, stripe, pagenr), + stripe, pagenr, rbio->stripe_len); + if (ret < 0) + goto cleanup; + } + } + + bios_to_read = bio_list_size(&bio_list); + if (!bios_to_read) { + /* + * we might have no bios to read just because the pages + * were up to date, or we might have no bios to read because + * the devices were gone. + */ + if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) { + __raid_recover_end_io(rbio); + goto out; + } else { + goto cleanup; + } + } + + /* + * the bbio may be freed once we submit the last bio. Make sure + * not to touch it after that + */ + atomic_set(&bbio->stripes_pending, bios_to_read); + while (1) { + bio = bio_list_pop(&bio_list); + if (!bio) + break; + + bio->bi_private = rbio; + bio->bi_end_io = raid_recover_end_io; + + btrfs_bio_wq_end_io(rbio->fs_info, bio, + BTRFS_WQ_ENDIO_RAID56); + + BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); + submit_bio(READ, bio); + } +out: + return 0; + +cleanup: + if (rbio->read_rebuild) + rbio_orig_end_io(rbio, -EIO, 0); + return -EIO; +} + +/* + * the main entry point for reads from the higher layers. This + * is really only called when the normal read path had a failure, + * so we assume the bio they send down corresponds to a failed part + * of the drive. + */ +int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 *raid_map, + u64 stripe_len, int mirror_num) +{ + struct btrfs_raid_bio *rbio; + int ret; + + rbio = alloc_rbio(root, bbio, raid_map, stripe_len); + if (IS_ERR(rbio)) { + return PTR_ERR(rbio); + } + + rbio->read_rebuild = 1; + bio_list_add(&rbio->bio_list, bio); + rbio->bio_list_bytes = bio->bi_size; + + rbio->faila = find_logical_bio_stripe(rbio, bio); + if (rbio->faila == -1) { + BUG(); + kfree(rbio); + return -EIO; + } + + /* + * reconstruct from the q stripe if they are + * asking for mirror 3 + */ + if (mirror_num == 3) + rbio->failb = bbio->num_stripes - 2; + + ret = lock_stripe_add(rbio); + + /* + * __raid56_parity_recover will end the bio with + * any errors it hits. We don't want to return + * its error value up the stack because our caller + * will end up calling bio_endio with any nonzero + * return + */ + if (ret == 0) + __raid56_parity_recover(rbio); + /* + * our rbio has been added to the list of + * rbios that will be handled after the + * currently lock owner is done + */ + return 0; + +} + +static void rmw_work(struct btrfs_work *work) +{ + struct btrfs_raid_bio *rbio; + + rbio = container_of(work, struct btrfs_raid_bio, work); + raid56_rmw_stripe(rbio); +} + +static void read_rebuild_work(struct btrfs_work *work) +{ + struct btrfs_raid_bio *rbio; + + rbio = container_of(work, struct btrfs_raid_bio, work); + __raid56_parity_recover(rbio); +} diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h new file mode 100644 index 000000000000..ea5d73bfdfbe --- /dev/null +++ b/fs/btrfs/raid56.h @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2012 Fusion-io All rights reserved. + * Copyright (C) 2012 Intel Corp. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_RAID56__ +#define __BTRFS_RAID56__ +static inline int nr_parity_stripes(struct map_lookup *map) +{ + if (map->type & BTRFS_BLOCK_GROUP_RAID5) + return 1; + else if (map->type & BTRFS_BLOCK_GROUP_RAID6) + return 2; + else + return 0; +} + +static inline int nr_data_stripes(struct map_lookup *map) +{ + return map->num_stripes - nr_parity_stripes(map); +} +#define RAID5_P_STRIPE ((u64)-2) +#define RAID6_Q_STRIPE ((u64)-1) + +#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ + ((x) == RAID6_Q_STRIPE)) + +int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 *raid_map, + u64 stripe_len, int mirror_num); +int raid56_parity_write(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 *raid_map, + u64 stripe_len); + +int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); +void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); +#endif diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 300e09ac3659..b67171e6d688 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1269,6 +1269,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del) } spin_unlock(&rc->reloc_root_tree.lock); + if (!node) + return 0; BUG_ON((struct btrfs_root *)node->data != root); if (!del) { @@ -2238,13 +2240,28 @@ again: } static noinline_for_stack +void free_reloc_roots(struct list_head *list) +{ + struct btrfs_root *reloc_root; + + while (!list_empty(list)) { + reloc_root = list_entry(list->next, struct btrfs_root, + root_list); + __update_reloc_root(reloc_root, 1); + free_extent_buffer(reloc_root->node); + free_extent_buffer(reloc_root->commit_root); + kfree(reloc_root); + } +} + +static noinline_for_stack int merge_reloc_roots(struct reloc_control *rc) { struct btrfs_root *root; struct btrfs_root *reloc_root; LIST_HEAD(reloc_roots); int found = 0; - int ret; + int ret = 0; again: root = rc->extent_root; @@ -2270,20 +2287,33 @@ again: BUG_ON(root->reloc_root != reloc_root); ret = merge_reloc_root(rc, root); - BUG_ON(ret); + if (ret) + goto out; } else { list_del_init(&reloc_root->root_list); } ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); - BUG_ON(ret < 0); + if (ret < 0) { + if (list_empty(&reloc_root->root_list)) + list_add_tail(&reloc_root->root_list, + &reloc_roots); + goto out; + } } if (found) { found = 0; goto again; } +out: + if (ret) { + btrfs_std_error(root->fs_info, ret); + if (!list_empty(&reloc_roots)) + free_reloc_roots(&reloc_roots); + } + BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); - return 0; + return ret; } static void free_block_list(struct rb_root *blocks) @@ -2818,8 +2848,10 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, int err = 0; path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + err = -ENOMEM; + goto out_path; + } rb_node = rb_first(blocks); while (rb_node) { @@ -2858,10 +2890,11 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, rb_node = rb_next(rb_node); } out: - free_block_list(blocks); err = finish_pending_nodes(trans, rc, path, err); btrfs_free_path(path); +out_path: + free_block_list(blocks); return err; } @@ -3017,7 +3050,7 @@ static int relocate_file_extent_cluster(struct inode *inode, } } - page_start = (u64)page->index << PAGE_CACHE_SHIFT; + page_start = page_offset(page); page_end = page_start + PAGE_CACHE_SIZE - 1; lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); @@ -3472,7 +3505,7 @@ out: } /* - * hepler to find all tree blocks that reference a given data extent + * helper to find all tree blocks that reference a given data extent */ static noinline_for_stack int add_data_references(struct reloc_control *rc, @@ -3566,7 +3599,7 @@ int add_data_references(struct reloc_control *rc, } /* - * hepler to find next unprocessed extent + * helper to find next unprocessed extent */ static noinline_for_stack int find_next_extent(struct btrfs_trans_handle *trans, @@ -3698,7 +3731,15 @@ int prepare_to_relocate(struct reloc_control *rc) set_reloc_control(rc); trans = btrfs_join_transaction(rc->extent_root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + unset_reloc_control(rc); + /* + * extent tree is not a ref_cow tree and has no reloc_root to + * cleanup. And callers are responsible to free the above + * block rsv. + */ + return PTR_ERR(trans); + } btrfs_commit_transaction(trans, rc->extent_root); return 0; } @@ -3730,7 +3771,11 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) while (1) { progress++; trans = btrfs_start_transaction(rc->extent_root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + trans = NULL; + break; + } restart: if (update_backref_cache(trans, &rc->backref_cache)) { btrfs_end_transaction(trans, rc->extent_root); @@ -4264,14 +4309,9 @@ int btrfs_recover_relocation(struct btrfs_root *root) out_free: kfree(rc); out: - while (!list_empty(&reloc_roots)) { - reloc_root = list_entry(reloc_roots.next, - struct btrfs_root, root_list); - list_del(&reloc_root->root_list); - free_extent_buffer(reloc_root->node); - free_extent_buffer(reloc_root->commit_root); - kfree(reloc_root); - } + if (!list_empty(&reloc_roots)) + free_reloc_roots(&reloc_roots); + btrfs_free_path(path); if (err == 0) { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index bdbb94f245c9..85e072b956d5 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -28,6 +28,7 @@ #include "dev-replace.h" #include "check-integrity.h" #include "rcu-string.h" +#include "raid56.h" /* * This is only the first step towards a full-features scrub. It reads all @@ -541,7 +542,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) eb = path->nodes[0]; ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); item_size = btrfs_item_size_nr(eb, path->slots[0]); - btrfs_release_path(path); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { do { @@ -557,7 +557,9 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) ret < 0 ? -1 : ref_level, ret < 0 ? -1 : ref_root); } while (ret != 1); + btrfs_release_path(path); } else { + btrfs_release_path(path); swarn.path = path; swarn.dev = dev; iterate_extent_inodes(fs_info, found_key.objectid, @@ -580,20 +582,29 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) int corrected = 0; struct btrfs_key key; struct inode *inode = NULL; + struct btrfs_fs_info *fs_info; u64 end = offset + PAGE_SIZE - 1; struct btrfs_root *local_root; + int srcu_index; key.objectid = root; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key); - if (IS_ERR(local_root)) + + fs_info = fixup->root->fs_info; + srcu_index = srcu_read_lock(&fs_info->subvol_srcu); + + local_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(local_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); return PTR_ERR(local_root); + } key.type = BTRFS_INODE_ITEM_KEY; key.objectid = inum; key.offset = 0; - inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL); + inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -606,7 +617,6 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) } if (PageUptodate(page)) { - struct btrfs_fs_info *fs_info; if (PageDirty(page)) { /* * we need to write the data to the defect sector. the @@ -2246,6 +2256,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct btrfs_device *extent_dev; int extent_mirror_num; + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + if (num >= nr_data_stripes(map)) { + return 0; + } + } + nstripes = length; offset = 0; do_div(nstripes, map->stripe_len); @@ -2700,7 +2717,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, int ret; struct btrfs_root *root = sctx->dev_root; - if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return -EIO; gen = root->fs_info->last_trans_committed; @@ -3180,18 +3197,25 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) u64 physical_for_dev_replace; u64 len; struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; + int srcu_index; key.objectid = root; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; + + srcu_index = srcu_read_lock(&fs_info->subvol_srcu); + local_root = btrfs_read_fs_root_no_name(fs_info, &key); - if (IS_ERR(local_root)) + if (IS_ERR(local_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); return PTR_ERR(local_root); + } key.type = BTRFS_INODE_ITEM_KEY; key.objectid = inum; key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); if (IS_ERR(inode)) return PTR_ERR(inode); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 321b7fb4e441..c85e7c6b4598 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -85,6 +85,7 @@ struct send_ctx { u32 send_max_size; u64 total_send_size; u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; + u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ struct vfsmount *mnt; @@ -3709,6 +3710,39 @@ out: return ret; } +/* + * Send an update extent command to user space. + */ +static int send_update_extent(struct send_ctx *sctx, + u64 offset, u32 len) +{ + int ret = 0; + struct fs_path *p; + + p = fs_path_alloc(sctx); + if (!p) + return -ENOMEM; + + ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(sctx, p); + return ret; +} + static int send_write_or_clone(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key, @@ -3744,7 +3778,11 @@ static int send_write_or_clone(struct send_ctx *sctx, goto out; } - if (!clone_root) { + if (clone_root) { + ret = send_clone(sctx, offset, len, clone_root); + } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) { + ret = send_update_extent(sctx, offset, len); + } else { while (pos < len) { l = len - pos; if (l > BTRFS_SEND_READ_SIZE) @@ -3757,10 +3795,7 @@ static int send_write_or_clone(struct send_ctx *sctx, pos += ret; } ret = 0; - } else { - ret = send_clone(sctx, offset, len, clone_root); } - out: return ret; } @@ -3910,12 +3945,10 @@ static int is_extent_unchanged(struct send_ctx *sctx, found_key.type != key.type) { key.offset += right_len; break; - } else { - if (found_key.offset != key.offset + right_len) { - /* Should really not happen */ - ret = -EIO; - goto out; - } + } + if (found_key.offset != key.offset + right_len) { + ret = 0; + goto out; } key = found_key; } @@ -4536,7 +4569,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) struct btrfs_fs_info *fs_info; struct btrfs_ioctl_send_args *arg = NULL; struct btrfs_key key; - struct file *filp = NULL; struct send_ctx *sctx = NULL; u32 i; u64 *clone_sources_tmp = NULL; @@ -4544,7 +4576,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - send_root = BTRFS_I(fdentry(mnt_file)->d_inode)->root; + send_root = BTRFS_I(file_inode(mnt_file))->root; fs_info = send_root->fs_info; arg = memdup_user(arg_, sizeof(*arg)); @@ -4561,6 +4593,11 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; } + if (arg->flags & ~BTRFS_SEND_FLAG_NO_FILE_DATA) { + ret = -EINVAL; + goto out; + } + sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS); if (!sctx) { ret = -ENOMEM; @@ -4572,6 +4609,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS); INIT_LIST_HEAD(&sctx->name_cache_list); + sctx->flags = arg->flags; + sctx->send_filp = fget(arg->send_fd); if (IS_ERR(sctx->send_filp)) { ret = PTR_ERR(sctx->send_filp); @@ -4673,8 +4712,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; out: - if (filp) - fput(filp); kfree(arg); vfree(clone_sources_tmp); diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 1bf4f32fd4ef..8bb18f7ccaa6 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -86,6 +86,7 @@ enum btrfs_send_cmd { BTRFS_SEND_C_UTIMES, BTRFS_SEND_C_END, + BTRFS_SEND_C_UPDATE_EXTENT, __BTRFS_SEND_C_MAX, }; #define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d8982e9601d3..f6b88595f858 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -41,13 +41,13 @@ #include <linux/slab.h> #include <linux/cleancache.h> #include <linux/ratelimit.h> +#include <linux/btrfs.h> #include "compat.h" #include "delayed-inode.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "ioctl.h" #include "print-tree.h" #include "xattr.h" #include "volumes.h" @@ -63,8 +63,7 @@ static const struct super_operations btrfs_super_ops; static struct file_system_type btrfs_fs_type; -static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, - char nbuf[16]) +static const char *btrfs_decode_error(int errno, char nbuf[16]) { char *errstr = NULL; @@ -98,7 +97,7 @@ static void __save_error_info(struct btrfs_fs_info *fs_info) * today we only save the error info into ram. Long term we'll * also send it down to the disk */ - fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR; + set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); } static void save_error_info(struct btrfs_fs_info *fs_info) @@ -114,7 +113,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) if (sb->s_flags & MS_RDONLY) return; - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { sb->s_flags |= MS_RDONLY; printk(KERN_INFO "btrfs is forced readonly\n"); /* @@ -142,8 +141,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, struct super_block *sb = fs_info->sb; char nbuf[16]; const char *errstr; - va_list args; - va_start(args, fmt); /* * Special case: if the error is EROFS, and we're already @@ -152,15 +149,18 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) return; - errstr = btrfs_decode_error(fs_info, errno, nbuf); + errstr = btrfs_decode_error(errno, nbuf); if (fmt) { - struct va_format vaf = { - .fmt = fmt, - .va = &args, - }; + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n", sb->s_id, function, line, errstr, &vaf); + va_end(args); } else { printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n", sb->s_id, function, line, errstr); @@ -171,7 +171,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, save_error_info(fs_info); btrfs_handle_error(fs_info); } - va_end(args); } static const char * const logtypes[] = { @@ -261,7 +260,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, char nbuf[16]; const char *errstr; - errstr = btrfs_decode_error(root->fs_info, errno, nbuf); + errstr = btrfs_decode_error(errno, nbuf); btrfs_printk(root->fs_info, "%s:%d: Aborting unused transaction(%s).\n", function, line, errstr); @@ -289,8 +288,8 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, va_start(args, fmt); vaf.va = &args; - errstr = btrfs_decode_error(fs_info, errno, nbuf); - if (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR) + errstr = btrfs_decode_error(errno, nbuf); + if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)) panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n", s_id, function, line, &vaf, errstr); @@ -438,6 +437,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_compress_force: case Opt_compress_force_type: compress_force = true; + /* Fallthrough */ case Opt_compress: case Opt_compress_type: if (token == Opt_compress || @@ -519,7 +519,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_alloc_start: num = match_strdup(&args[0]); if (num) { + mutex_lock(&info->chunk_mutex); info->alloc_start = memparse(num, NULL); + mutex_unlock(&info->chunk_mutex); kfree(num); printk(KERN_INFO "btrfs: allocations start at %llu\n", @@ -876,7 +878,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) btrfs_wait_ordered_extents(root, 0); - trans = btrfs_attach_transaction(root); + trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { /* no transaction, don't bother */ if (PTR_ERR(trans) == -ENOENT) @@ -1200,6 +1202,38 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, new_pool_size); } +static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info, + unsigned long old_opts, int flags) +{ + set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); + + if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && + (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || + (flags & MS_RDONLY))) { + /* wait for any defraggers to finish */ + wait_event(fs_info->transaction_wait, + (atomic_read(&fs_info->defrag_running) == 0)); + if (flags & MS_RDONLY) + sync_filesystem(fs_info->sb); + } +} + +static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, + unsigned long old_opts) +{ + /* + * We need cleanup all defragable inodes if the autodefragment is + * close or the fs is R/O. + */ + if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && + (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || + (fs_info->sb->s_flags & MS_RDONLY))) { + btrfs_cleanup_defrag_inodes(fs_info); + } + + clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); +} + static int btrfs_remount(struct super_block *sb, int *flags, char *data) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -1213,6 +1247,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) unsigned int old_metadata_ratio = fs_info->metadata_ratio; int ret; + btrfs_remount_prepare(fs_info, old_opts, *flags); + ret = btrfs_parse_options(root, data); if (ret) { ret = -EINVAL; @@ -1223,7 +1259,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) fs_info->thread_pool_size, old_thread_pool_size); if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) - return 0; + goto out; if (*flags & MS_RDONLY) { /* @@ -1278,7 +1314,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } sb->s_flags &= ~MS_RDONLY; } - +out: + btrfs_remount_cleanup(fs_info, old_opts); return 0; restore: @@ -1289,10 +1326,13 @@ restore: fs_info->mount_opt = old_opts; fs_info->compress_type = old_compress_type; fs_info->max_inline = old_max_inline; + mutex_lock(&fs_info->chunk_mutex); fs_info->alloc_start = old_alloc_start; + mutex_unlock(&fs_info->chunk_mutex); btrfs_resize_thread_pool(fs_info, old_thread_pool_size, fs_info->thread_pool_size); fs_info->metadata_ratio = old_metadata_ratio; + btrfs_remount_cleanup(fs_info, old_opts); return ret; } @@ -1518,6 +1558,7 @@ static struct file_system_type btrfs_fs_type = { .kill_sb = btrfs_kill_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("btrfs"); /* * used by btrfsctl to scan devices when no FS is mounted @@ -1559,7 +1600,7 @@ static int btrfs_freeze(struct super_block *sb) struct btrfs_trans_handle *trans; struct btrfs_root *root = btrfs_sb(sb)->tree_root; - trans = btrfs_attach_transaction(root); + trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { /* no transaction, don't bother */ if (PTR_ERR(trans) == -ENOENT) @@ -1684,10 +1725,14 @@ static int __init init_btrfs_fs(void) if (err) goto free_delayed_inode; - err = btrfs_interface_init(); + err = btrfs_delayed_ref_init(); if (err) goto free_auto_defrag; + err = btrfs_interface_init(); + if (err) + goto free_delayed_ref; + err = register_filesystem(&btrfs_fs_type); if (err) goto unregister_ioctl; @@ -1699,6 +1744,8 @@ static int __init init_btrfs_fs(void) unregister_ioctl: btrfs_interface_exit(); +free_delayed_ref: + btrfs_delayed_ref_exit(); free_auto_defrag: btrfs_auto_defrag_exit(); free_delayed_inode: @@ -1720,6 +1767,7 @@ free_compress: static void __exit exit_btrfs_fs(void) { btrfs_destroy_cachep(); + btrfs_delayed_ref_exit(); btrfs_auto_defrag_exit(); btrfs_delayed_inode_exit(); ordered_data_exit(); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index daac9ae6d731..5b326cd60a4a 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -21,7 +21,6 @@ #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> -#include <linux/module.h> #include <linux/kobject.h> #include "ctree.h" diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f15494699f3b..50767bbaad6c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -40,7 +40,6 @@ void put_transaction(struct btrfs_transaction *transaction) if (atomic_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); WARN_ON(transaction->delayed_refs.root.rb_node); - memset(transaction, 0, sizeof(*transaction)); kmem_cache_free(btrfs_transaction_cachep, transaction); } } @@ -51,6 +50,14 @@ static noinline void switch_commit_root(struct btrfs_root *root) root->commit_root = btrfs_root_node(root); } +static inline int can_join_transaction(struct btrfs_transaction *trans, + int type) +{ + return !(trans->in_commit && + type != TRANS_JOIN && + type != TRANS_JOIN_NOLOCK); +} + /* * either allocate a new transaction or hop into the existing one */ @@ -62,7 +69,7 @@ static noinline int join_transaction(struct btrfs_root *root, int type) spin_lock(&fs_info->trans_lock); loop: /* The file system has been taken offline. No new transactions. */ - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { spin_unlock(&fs_info->trans_lock); return -EROFS; } @@ -86,6 +93,10 @@ loop: spin_unlock(&fs_info->trans_lock); return cur_trans->aborted; } + if (!can_join_transaction(cur_trans, type)) { + spin_unlock(&fs_info->trans_lock); + return -EBUSY; + } atomic_inc(&cur_trans->use_count); atomic_inc(&cur_trans->num_writers); cur_trans->num_joined++; @@ -112,9 +123,8 @@ loop: * to redo the trans_no_join checks above */ kmem_cache_free(btrfs_transaction_cachep, cur_trans); - cur_trans = fs_info->running_transaction; goto loop; - } else if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { spin_unlock(&fs_info->trans_lock); kmem_cache_free(btrfs_transaction_cachep, cur_trans); return -EROFS; @@ -156,8 +166,12 @@ loop: spin_lock_init(&cur_trans->commit_lock); spin_lock_init(&cur_trans->delayed_refs.lock); + atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0); + atomic_set(&cur_trans->delayed_refs.ref_seq, 0); + init_waitqueue_head(&cur_trans->delayed_refs.wait); INIT_LIST_HEAD(&cur_trans->pending_snapshots); + INIT_LIST_HEAD(&cur_trans->ordered_operations); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, fs_info->btree_inode->i_mapping); @@ -302,7 +316,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type, int ret; u64 qgroup_reserved = 0; - if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return ERR_PTR(-EROFS); if (current->journal_info) { @@ -333,12 +347,14 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type, &root->fs_info->trans_block_rsv, num_bytes, flush); if (ret) - return ERR_PTR(ret); + goto reserve_fail; } again: h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); - if (!h) - return ERR_PTR(-ENOMEM); + if (!h) { + ret = -ENOMEM; + goto alloc_fail; + } /* * If we are JOIN_NOLOCK we're already committing a transaction and @@ -358,18 +374,17 @@ again: do { ret = join_transaction(root, type); - if (ret == -EBUSY) + if (ret == -EBUSY) { wait_current_trans(root); + if (unlikely(type == TRANS_ATTACH)) + ret = -ENOENT; + } } while (ret == -EBUSY); if (ret < 0) { /* We must get the transaction if we are JOIN_NOLOCK. */ BUG_ON(type == TRANS_JOIN_NOLOCK); - - if (type < TRANS_JOIN_NOLOCK) - sb_end_intwrite(root->fs_info->sb); - kmem_cache_free(btrfs_trans_handle_cachep, h); - return ERR_PTR(ret); + goto join_fail; } cur_trans = root->fs_info->running_transaction; @@ -385,9 +400,10 @@ again: h->block_rsv = NULL; h->orig_rsv = NULL; h->aborted = 0; - h->qgroup_reserved = qgroup_reserved; + h->qgroup_reserved = 0; h->delayed_ref_elem.seq = 0; h->type = type; + h->allocating_chunk = false; INIT_LIST_HEAD(&h->qgroup_ref_list); INIT_LIST_HEAD(&h->new_bgs); @@ -403,6 +419,7 @@ again: h->block_rsv = &root->fs_info->trans_block_rsv; h->bytes_reserved = num_bytes; } + h->qgroup_reserved = qgroup_reserved; got_it: btrfs_record_root_in_trans(h, root); @@ -410,6 +427,19 @@ got_it: if (!current->journal_info && type != TRANS_USERSPACE) current->journal_info = h; return h; + +join_fail: + if (type < TRANS_JOIN_NOLOCK) + sb_end_intwrite(root->fs_info->sb); + kmem_cache_free(btrfs_trans_handle_cachep, h); +alloc_fail: + if (num_bytes) + btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, + num_bytes); +reserve_fail: + if (qgroup_reserved) + btrfs_qgroup_free(root, qgroup_reserved); + return ERR_PTR(ret); } struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, @@ -441,11 +471,43 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root return start_transaction(root, 0, TRANS_USERSPACE, 0); } +/* + * btrfs_attach_transaction() - catch the running transaction + * + * It is used when we want to commit the current the transaction, but + * don't want to start a new one. + * + * Note: If this function return -ENOENT, it just means there is no + * running transaction. But it is possible that the inactive transaction + * is still in the memory, not fully on disk. If you hope there is no + * inactive transaction in the fs when -ENOENT is returned, you should + * invoke + * btrfs_attach_transaction_barrier() + */ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) { return start_transaction(root, 0, TRANS_ATTACH, 0); } +/* + * btrfs_attach_transaction() - catch the running transaction + * + * It is similar to the above function, the differentia is this one + * will wait for all the inactive transactions until they fully + * complete. + */ +struct btrfs_trans_handle * +btrfs_attach_transaction_barrier(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + + trans = start_transaction(root, 0, TRANS_ATTACH, 0); + if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT) + btrfs_wait_for_commit(root, 0); + + return trans; +} + /* wait for a transaction commit to be fully complete */ static noinline void wait_for_commit(struct btrfs_root *root, struct btrfs_transaction *commit) @@ -563,21 +625,20 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; - /* - * the same root has to be passed to start_transaction and - * end_transaction. Subvolume quota depends on this. - */ - WARN_ON(trans->root != root); if (trans->qgroup_reserved) { - btrfs_qgroup_free(root, trans->qgroup_reserved); + /* + * the same root has to be passed here between start_transaction + * and end_transaction. Subvolume quota depends on this. + */ + btrfs_qgroup_free(trans->root, trans->qgroup_reserved); trans->qgroup_reserved = 0; } if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); - while (count < 2) { + while (count < 1) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (cur && @@ -589,6 +650,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } count++; } + btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; @@ -634,12 +696,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_run_delayed_iputs(root); if (trans->aborted || - root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) err = -EIO; - } assert_qgroups_uptodate(trans); - memset(trans, 0, sizeof(*trans)); kmem_cache_free(btrfs_trans_handle_cachep, trans); return err; } @@ -686,7 +746,9 @@ int btrfs_write_marked_extents(struct btrfs_root *root, struct extent_state *cached_state = NULL; u64 start = 0; u64 end; + struct blk_plug plug; + blk_start_plug(&plug); while (!find_first_extent_bit(dirty_pages, start, &start, &end, mark, &cached_state)) { convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, @@ -700,6 +762,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root, } if (err) werr = err; + blk_finish_plug(&plug); return werr; } @@ -950,10 +1013,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, } /* - * defrag a given btree. If cacheonly == 1, this won't read from the disk, - * otherwise every leaf in the btree is read and defragged. + * defrag a given btree. + * Every leaf in the btree is read and defragged. */ -int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) +int btrfs_defrag_root(struct btrfs_root *root) { struct btrfs_fs_info *info = root->fs_info; struct btrfs_trans_handle *trans; @@ -967,7 +1030,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_defrag_leaves(trans, root, cacheonly); + ret = btrfs_defrag_leaves(trans, root); btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(info->tree_root); @@ -975,6 +1038,12 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) break; + + if (btrfs_defrag_cancelled(root->fs_info)) { + printk(KERN_DEBUG "btrfs: defrag_root cancelled\n"); + ret = -EAGAIN; + break; + } } root->defrag_running = 0; return ret; @@ -982,7 +1051,12 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) /* * new snapshots need to be created at a very specific time in the - * transaction commit. This does the actual creation + * transaction commit. This does the actual creation. + * + * Note: + * If the error which may affect the commitment of the current transaction + * happens, we should return the error number. If the error which just affect + * the creation of the pending snapshots, just return 0. */ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, @@ -997,12 +1071,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct inode *parent_inode; struct btrfs_path *path; struct btrfs_dir_item *dir_item; - struct dentry *parent; struct dentry *dentry; struct extent_buffer *tmp; struct extent_buffer *old; struct timespec cur_time = CURRENT_TIME; - int ret; + int ret = 0; u64 to_reserve = 0; u64 index = 0; u64 objectid; @@ -1011,40 +1084,36 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { - ret = pending->error = -ENOMEM; - goto path_alloc_fail; + pending->error = -ENOMEM; + return 0; } new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); if (!new_root_item) { - ret = pending->error = -ENOMEM; + pending->error = -ENOMEM; goto root_item_alloc_fail; } - ret = btrfs_find_free_objectid(tree_root, &objectid); - if (ret) { - pending->error = ret; + pending->error = btrfs_find_free_objectid(tree_root, &objectid); + if (pending->error) goto no_free_objectid; - } btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); if (to_reserve > 0) { - ret = btrfs_block_rsv_add(root, &pending->block_rsv, - to_reserve, - BTRFS_RESERVE_NO_FLUSH); - if (ret) { - pending->error = ret; + pending->error = btrfs_block_rsv_add(root, + &pending->block_rsv, + to_reserve, + BTRFS_RESERVE_NO_FLUSH); + if (pending->error) goto no_free_objectid; - } } - ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid, - objectid, pending->inherit); - if (ret) { - pending->error = ret; + pending->error = btrfs_qgroup_inherit(trans, fs_info, + root->root_key.objectid, + objectid, pending->inherit); + if (pending->error) goto no_free_objectid; - } key.objectid = objectid; key.offset = (u64)-1; @@ -1052,10 +1121,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, rsv = trans->block_rsv; trans->block_rsv = &pending->block_rsv; + trans->bytes_reserved = trans->block_rsv->reserved; dentry = pending->dentry; - parent = dget_parent(dentry); - parent_inode = parent->d_inode; + parent_inode = pending->dir; parent_root = BTRFS_I(parent_inode)->root; record_root_in_trans(trans, parent_root); @@ -1072,7 +1141,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, dentry->d_name.len, 0); if (dir_item != NULL && !IS_ERR(dir_item)) { pending->error = -EEXIST; - goto fail; + goto dir_item_existed; } else if (IS_ERR(dir_item)) { ret = PTR_ERR(dir_item); btrfs_abort_transaction(trans, root, ret); @@ -1203,14 +1272,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret) btrfs_abort_transaction(trans, root, ret); fail: - dput(parent); + pending->error = ret; +dir_item_existed: trans->block_rsv = rsv; + trans->bytes_reserved = 0; no_free_objectid: kfree(new_root_item); root_item_alloc_fail: btrfs_free_path(path); -path_alloc_fail: - btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); return ret; } @@ -1220,12 +1289,17 @@ path_alloc_fail: static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { - struct btrfs_pending_snapshot *pending; + struct btrfs_pending_snapshot *pending, *next; struct list_head *head = &trans->transaction->pending_snapshots; + int ret = 0; - list_for_each_entry(pending, head, list) - create_pending_snapshot(trans, fs_info, pending); - return 0; + list_for_each_entry_safe(pending, next, head, list) { + list_del(&pending->list); + ret = create_pending_snapshot(trans, fs_info, pending); + if (ret) + break; + } + return ret; } static void update_super_roots(struct btrfs_root *root) @@ -1296,13 +1370,13 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, struct btrfs_async_commit { struct btrfs_trans_handle *newtrans; struct btrfs_root *root; - struct delayed_work work; + struct work_struct work; }; static void do_async_commit(struct work_struct *work) { struct btrfs_async_commit *ac = - container_of(work, struct btrfs_async_commit, work.work); + container_of(work, struct btrfs_async_commit, work); /* * We've got freeze protection passed with the transaction. @@ -1330,7 +1404,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, if (!ac) return -ENOMEM; - INIT_DELAYED_WORK(&ac->work, do_async_commit); + INIT_WORK(&ac->work, do_async_commit); ac->root = root; ac->newtrans = btrfs_join_transaction(root); if (IS_ERR(ac->newtrans)) { @@ -1354,7 +1428,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1, _THIS_IP_); - schedule_delayed_work(&ac->work, 0); + schedule_work(&ac->work); /* wait for transaction to start and unblock */ if (wait_for_unblock) @@ -1374,16 +1448,29 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, int err) { struct btrfs_transaction *cur_trans = trans->transaction; + DEFINE_WAIT(wait); WARN_ON(trans->use_count > 1); btrfs_abort_transaction(trans, root, err); spin_lock(&root->fs_info->trans_lock); + + if (list_empty(&cur_trans->list)) { + spin_unlock(&root->fs_info->trans_lock); + btrfs_end_transaction(trans, root); + return; + } + list_del_init(&cur_trans->list); if (cur_trans == root->fs_info->running_transaction) { + root->fs_info->trans_no_join = 1; + spin_unlock(&root->fs_info->trans_lock); + wait_event(cur_trans->writer_wait, + atomic_read(&cur_trans->num_writers) == 1); + + spin_lock(&root->fs_info->trans_lock); root->fs_info->running_transaction = NULL; - root->fs_info->trans_no_join = 0; } spin_unlock(&root->fs_info->trans_lock); @@ -1417,7 +1504,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, } if (flush_on_commit || snap_pending) { - btrfs_start_delalloc_inodes(root, 1); + ret = btrfs_start_delalloc_inodes(root, 1); + if (ret) + return ret; btrfs_wait_ordered_extents(root, 1); } @@ -1439,9 +1528,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, * it here and no for sure that nothing new will be added * to the list */ - btrfs_run_ordered_operations(root, 1); + ret = btrfs_run_ordered_operations(trans, root, 1); - return 0; + return ret; } /* @@ -1462,27 +1551,35 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, int should_grow = 0; unsigned long now = get_seconds(); - ret = btrfs_run_ordered_operations(root, 0); + ret = btrfs_run_ordered_operations(trans, root, 0); if (ret) { btrfs_abort_transaction(trans, root, ret); - goto cleanup_transaction; + btrfs_end_transaction(trans, root); + return ret; } /* Stop the commit early if ->aborted is set */ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; - goto cleanup_transaction; + btrfs_end_transaction(trans, root); + return ret; } /* make a pass through all the delayed refs we have so far * any runnings procs may add more while we are here */ ret = btrfs_run_delayed_refs(trans, root, 0); - if (ret) - goto cleanup_transaction; + if (ret) { + btrfs_end_transaction(trans, root); + return ret; + } btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; + if (trans->qgroup_reserved) { + btrfs_qgroup_free(root, trans->qgroup_reserved); + trans->qgroup_reserved = 0; + } cur_trans = trans->transaction; @@ -1496,8 +1593,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_create_pending_block_groups(trans, root); ret = btrfs_run_delayed_refs(trans, root, 0); - if (ret) - goto cleanup_transaction; + if (ret) { + btrfs_end_transaction(trans, root); + return ret; + } spin_lock(&cur_trans->commit_lock); if (cur_trans->in_commit) { @@ -1761,6 +1860,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, cleanup_transaction: btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; + if (trans->qgroup_reserved) { + btrfs_qgroup_free(root, trans->qgroup_reserved); + trans->qgroup_reserved = 0; + } btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n"); // WARN_ON(1); if (current->journal_info == trans) diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 0e8aa1e6c287..3c8e0d25c8e4 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -43,6 +43,7 @@ struct btrfs_transaction { wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; struct list_head pending_snapshots; + struct list_head ordered_operations; struct btrfs_delayed_ref_root delayed_refs; int aborted; }; @@ -68,6 +69,7 @@ struct btrfs_trans_handle { struct btrfs_block_rsv *orig_rsv; short aborted; short adding_csums; + bool allocating_chunk; enum btrfs_trans_type type; /* * this root is only needed to validate that the root passed to @@ -82,11 +84,13 @@ struct btrfs_trans_handle { struct btrfs_pending_snapshot { struct dentry *dentry; + struct inode *dir; struct btrfs_root *root; struct btrfs_root *snap; struct btrfs_qgroup_inherit *inherit; /* block reservation for the operation */ struct btrfs_block_rsv block_rsv; + u64 qgroup_reserved; /* extra metadata reseration for relocation */ int error; bool readonly; @@ -110,13 +114,15 @@ struct btrfs_trans_handle *btrfs_start_transaction_lflush( struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_attach_transaction_barrier( + struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_add_dead_root(struct btrfs_root *root); -int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); +int btrfs_defrag_root(struct btrfs_root *root); int btrfs_clean_old_snapshots(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 3b580ee8ab1d..94e05c1f118a 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -23,13 +23,14 @@ #include "transaction.h" #include "locking.h" -/* defrag all the leaves in a given btree. If cache_only == 1, don't read - * things from disk, otherwise read all the leaves and try to get key order to +/* + * Defrag all the leaves in a given btree. + * Read all the leaves and try to get key order to * better reflect disk order */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int cache_only) + struct btrfs_root *root) { struct btrfs_path *path = NULL; struct btrfs_key key; @@ -41,9 +42,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, u64 last_ret = 0; u64 min_trans = 0; - if (cache_only) - goto out; - if (root->fs_info->extent_root == root) { /* * there's recursion here right now in the tree locking, @@ -86,11 +84,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, } path->keep_locks = 1; - if (cache_only) - min_trans = root->defrag_trans_start; - ret = btrfs_search_forward(root, &key, NULL, path, - cache_only, min_trans); + ret = btrfs_search_forward(root, &key, NULL, path, min_trans); if (ret < 0) goto out; if (ret > 0) { @@ -109,11 +104,11 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, goto out; } path->slots[1] = btrfs_header_nritems(path->nodes[1]); - next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only, + next_key_ret = btrfs_find_next_key(root, path, &key, 1, min_trans); ret = btrfs_realloc_node(trans, root, path->nodes[1], 0, - cache_only, &last_ret, + &last_ret, &root->defrag_progress); if (ret) { WARN_ON(ret == -EAGAIN); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9027bb1e7466..451fad96ecd1 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -278,8 +278,7 @@ static int process_one_buffer(struct btrfs_root *log, struct walk_control *wc, u64 gen) { if (wc->pin) - btrfs_pin_extent_for_log_replay(wc->trans, - log->fs_info->extent_root, + btrfs_pin_extent_for_log_replay(log->fs_info->extent_root, eb->start, eb->len); if (btrfs_buffer_uptodate(eb, gen, 0)) { @@ -485,7 +484,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, struct btrfs_key *key) { int found_type; - u64 mask = root->sectorsize - 1; u64 extent_end; u64 start = key->offset; u64 saved_nbytes; @@ -502,7 +500,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, extent_end = start + btrfs_file_extent_num_bytes(eb, item); else if (found_type == BTRFS_FILE_EXTENT_INLINE) { size = btrfs_file_extent_inline_len(eb, item); - extent_end = (start + size + mask) & ~mask; + extent_end = ALIGN(start + size, root->sectorsize); } else { ret = 0; goto out; @@ -1384,7 +1382,10 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, btrfs_release_path(path); if (ret == 0) { - btrfs_inc_nlink(inode); + if (!inode->i_nlink) + set_nlink(inode, 1); + else + btrfs_inc_nlink(inode); ret = btrfs_update_inode(trans, root, inode); } else if (ret == -EEXIST) { ret = 0; @@ -2281,6 +2282,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, unsigned long log_transid = 0; mutex_lock(&root->log_mutex); + log_transid = root->log_transid; index1 = root->log_transid % 2; if (atomic_read(&root->log_commit[index1])) { wait_log_commit(trans, root, root->log_transid); @@ -2308,11 +2310,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* bail out if we need to do a full commit */ if (root->fs_info->last_trans_log_full_commit == trans->transid) { ret = -EAGAIN; + btrfs_free_logged_extents(log, log_transid); mutex_unlock(&root->log_mutex); goto out; } - log_transid = root->log_transid; if (log_transid % 2 == 0) mark = EXTENT_DIRTY; else @@ -2324,6 +2326,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); if (ret) { btrfs_abort_transaction(trans, root, ret); + btrfs_free_logged_extents(log, log_transid); mutex_unlock(&root->log_mutex); goto out; } @@ -2363,6 +2366,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } root->fs_info->last_trans_log_full_commit = trans->transid; btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); ret = -EAGAIN; goto out; @@ -2373,6 +2377,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); wait_log_commit(trans, log_root_tree, log_root_tree->log_transid); + btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); ret = 0; goto out; @@ -2392,6 +2397,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ if (root->fs_info->last_trans_log_full_commit == trans->transid) { btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); ret = -EAGAIN; goto out_wake_log_root; @@ -2402,10 +2408,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, EXTENT_DIRTY | EXTENT_NEW); if (ret) { btrfs_abort_transaction(trans, root, ret); + btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; } btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + btrfs_wait_logged_extents(log, log_transid); btrfs_set_super_log_root(root->fs_info->super_for_commit, log_root_tree->node->start); @@ -2461,8 +2469,10 @@ static void free_log_tree(struct btrfs_trans_handle *trans, .process_func = process_one_buffer }; - ret = walk_log_tree(trans, log, &wc); - BUG_ON(ret); + if (trans) { + ret = walk_log_tree(trans, log, &wc); + BUG_ON(ret); + } while (1) { ret = find_first_extent_bit(&log->dirty_log_pages, @@ -2475,6 +2485,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans, EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); } + /* + * We may have short-circuited the log tree with the full commit logic + * and left ordered extents on our list, so clear these out to keep us + * from leaking inodes and memory. + */ + btrfs_free_logged_extents(log, 0); + btrfs_free_logged_extents(log, 1); + free_extent_buffer(log->node); kfree(log); } @@ -2724,7 +2742,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, path->keep_locks = 1; ret = btrfs_search_forward(root, &min_key, &max_key, - path, 0, trans->transid); + path, trans->transid); /* * we didn't find anything from this transaction, see if there @@ -3271,16 +3289,21 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_root *log = root->log_root; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; + struct btrfs_ordered_extent *ordered; struct list_head ordered_sums; struct btrfs_map_token token; struct btrfs_key key; - u64 csum_offset = em->mod_start - em->start; - u64 csum_len = em->mod_len; + u64 mod_start = em->mod_start; + u64 mod_len = em->mod_len; + u64 csum_offset; + u64 csum_len; u64 extent_offset = em->start - em->orig_start; u64 block_len; int ret; + int index = log->log_transid % 2; bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; +insert: INIT_LIST_HEAD(&ordered_sums); btrfs_init_map_token(&token); key.objectid = btrfs_ino(inode); @@ -3296,6 +3319,23 @@ static int log_one_extent(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + + /* + * If we are overwriting an inline extent with a real one then we need + * to just delete the inline extent as it may not be large enough to + * have the entire file_extent_item. + */ + if (ret && btrfs_token_file_extent_type(leaf, fi, &token) == + BTRFS_FILE_EXTENT_INLINE) { + ret = btrfs_del_item(trans, log, path); + btrfs_release_path(path); + if (ret) { + path->really_keep_locks = 0; + return ret; + } + goto insert; + } + btrfs_set_token_file_extent_generation(leaf, fi, em->generation, &token); if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { @@ -3362,6 +3402,92 @@ static int log_one_extent(struct btrfs_trans_handle *trans, csum_len = block_len; } + /* + * First check and see if our csums are on our outstanding ordered + * extents. + */ +again: + spin_lock_irq(&log->log_extents_lock[index]); + list_for_each_entry(ordered, &log->logged_list[index], log_list) { + struct btrfs_ordered_sum *sum; + + if (!mod_len) + break; + + if (ordered->inode != inode) + continue; + + if (ordered->file_offset + ordered->len <= mod_start || + mod_start + mod_len <= ordered->file_offset) + continue; + + /* + * We are going to copy all the csums on this ordered extent, so + * go ahead and adjust mod_start and mod_len in case this + * ordered extent has already been logged. + */ + if (ordered->file_offset > mod_start) { + if (ordered->file_offset + ordered->len >= + mod_start + mod_len) + mod_len = ordered->file_offset - mod_start; + /* + * If we have this case + * + * |--------- logged extent ---------| + * |----- ordered extent ----| + * + * Just don't mess with mod_start and mod_len, we'll + * just end up logging more csums than we need and it + * will be ok. + */ + } else { + if (ordered->file_offset + ordered->len < + mod_start + mod_len) { + mod_len = (mod_start + mod_len) - + (ordered->file_offset + ordered->len); + mod_start = ordered->file_offset + + ordered->len; + } else { + mod_len = 0; + } + } + + /* + * To keep us from looping for the above case of an ordered + * extent that falls inside of the logged extent. + */ + if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, + &ordered->flags)) + continue; + atomic_inc(&ordered->refs); + spin_unlock_irq(&log->log_extents_lock[index]); + /* + * we've dropped the lock, we must either break or + * start over after this. + */ + + wait_event(ordered->wait, ordered->csum_bytes_left == 0); + + list_for_each_entry(sum, &ordered->list, list) { + ret = btrfs_csum_file_blocks(trans, log, sum); + if (ret) { + btrfs_put_ordered_extent(ordered); + goto unlocked; + } + } + btrfs_put_ordered_extent(ordered); + goto again; + + } + spin_unlock_irq(&log->log_extents_lock[index]); +unlocked: + + if (!mod_len || ret) + return ret; + + csum_offset = mod_start - em->start; + csum_len = mod_len; + /* block start is already adjusted for the file extent offset. */ ret = btrfs_lookup_csums_range(log->fs_info->csum_root, em->block_start + csum_offset, @@ -3393,6 +3519,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; u64 test_gen; int ret = 0; + int num = 0; INIT_LIST_HEAD(&extents); @@ -3401,16 +3528,31 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, list_for_each_entry_safe(em, n, &tree->modified_extents, list) { list_del_init(&em->list); + + /* + * Just an arbitrary number, this can be really CPU intensive + * once we start getting a lot of extents, and really once we + * have a bunch of extents we just want to commit since it will + * be faster. + */ + if (++num > 32768) { + list_del_init(&tree->modified_extents); + ret = -EFBIG; + goto process; + } + if (em->generation <= test_gen) continue; /* Need a ref to keep it from getting evicted from cache */ atomic_inc(&em->refs); set_bit(EXTENT_FLAG_LOGGING, &em->flags); list_add_tail(&em->list, &extents); + num++; } list_sort(NULL, &extents, extent_cmp); +process: while (!list_empty(&extents)) { em = list_entry(extents.next, struct extent_map, list); @@ -3513,6 +3655,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, mutex_lock(&BTRFS_I(inode)->log_mutex); + btrfs_get_logged_extents(log, inode); + /* * a brute force approach to making sure we get the most uptodate * copies of everything. @@ -3558,7 +3702,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, while (1) { ins_nr = 0; ret = btrfs_search_forward(root, &min_key, &max_key, - path, 0, trans->transid); + path, trans->transid); if (ret != 0) break; again: @@ -3656,6 +3800,8 @@ log_extents: BTRFS_I(inode)->logged_trans = trans->transid; BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; out_unlock: + if (err) + btrfs_free_logged_extents(log, log->log_transid); mutex_unlock(&BTRFS_I(inode)->log_mutex); btrfs_free_path(path); @@ -3822,7 +3968,6 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, end_trans: dput(old_parent); if (ret < 0) { - WARN_ON(ret != -ENOSPC); root->fs_info->last_trans_log_full_commit = trans->transid; ret = 1; } diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 99be4c138db6..ddc61cad0080 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -5,7 +5,7 @@ */ #include <linux/slab.h> -#include <linux/module.h> +#include <linux/export.h> #include "ulist.h" /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 15f6efdf6463..2854c824ab64 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -25,6 +25,8 @@ #include <linux/capability.h> #include <linux/ratelimit.h> #include <linux/kthread.h> +#include <linux/raid/pq.h> +#include <asm/div64.h> #include "compat.h" #include "ctree.h" #include "extent_map.h" @@ -32,6 +34,7 @@ #include "transaction.h" #include "print-tree.h" #include "volumes.h" +#include "raid56.h" #include "async-thread.h" #include "check-integrity.h" #include "rcu-string.h" @@ -647,6 +650,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) new_device->writeable = 0; new_device->in_fs_metadata = 0; new_device->can_discard = 0; + spin_lock_init(&new_device->io_lock); list_replace_rcu(&device->dev_list, &new_device->dev_list); call_rcu(&device->rcu, free_device); @@ -680,6 +684,12 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) __btrfs_close_devices(fs_devices); free_fs_devices(fs_devices); } + /* + * Wait for rcu kworkers under __btrfs_close_devices + * to finish all blkdev_puts so device is really + * free when umount is done. + */ + rcu_barrier(); return ret; } @@ -792,26 +802,75 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, return ret; } +/* + * Look for a btrfs signature on a device. This may be called out of the mount path + * and we are not allowed to call set_blocksize during the scan. The superblock + * is read via pagecache + */ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_fs_devices **fs_devices_ret) { struct btrfs_super_block *disk_super; struct block_device *bdev; - struct buffer_head *bh; - int ret; + struct page *page; + void *p; + int ret = -EINVAL; u64 devid; u64 transid; u64 total_devices; + u64 bytenr; + pgoff_t index; + /* + * we would like to check all the supers, but that would make + * a btrfs mount succeed after a mkfs from a different FS. + * So, we need to add a special mount option to scan for + * later supers, using BTRFS_SUPER_MIRROR_MAX instead + */ + bytenr = btrfs_sb_offset(0); flags |= FMODE_EXCL; mutex_lock(&uuid_mutex); - ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh); - if (ret) + + bdev = blkdev_get_by_path(path, flags, holder); + + if (IS_ERR(bdev)) { + ret = PTR_ERR(bdev); goto error; - disk_super = (struct btrfs_super_block *)bh->b_data; + } + + /* make sure our super fits in the device */ + if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode)) + goto error_bdev_put; + + /* make sure our super fits in the page */ + if (sizeof(*disk_super) > PAGE_CACHE_SIZE) + goto error_bdev_put; + + /* make sure our super doesn't straddle pages on disk */ + index = bytenr >> PAGE_CACHE_SHIFT; + if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index) + goto error_bdev_put; + + /* pull in the page with our super */ + page = read_cache_page_gfp(bdev->bd_inode->i_mapping, + index, GFP_NOFS); + + if (IS_ERR_OR_NULL(page)) + goto error_bdev_put; + + p = kmap(page); + + /* align our pointer to the offset of the super block */ + disk_super = p + (bytenr & ~PAGE_CACHE_MASK); + + if (btrfs_super_bytenr(disk_super) != bytenr || + disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) + goto error_unmap; + devid = btrfs_stack_device_id(&disk_super->dev_item); transid = btrfs_super_generation(disk_super); total_devices = btrfs_super_num_devices(disk_super); + if (disk_super->label[0]) { if (disk_super->label[BTRFS_LABEL_SIZE - 1]) disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; @@ -819,12 +878,19 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, } else { printk(KERN_INFO "device fsid %pU ", disk_super->fsid); } + printk(KERN_CONT "devid %llu transid %llu %s\n", (unsigned long long)devid, (unsigned long long)transid, path); + ret = device_list_add(path, disk_super, devid, fs_devices_ret); if (!ret && fs_devices_ret) (*fs_devices_ret)->total_devices = total_devices; - brelse(bh); + +error_unmap: + kunmap(page); + page_cache_release(page); + +error_bdev_put: blkdev_put(bdev, flags); error: mutex_unlock(&uuid_mutex); @@ -1372,14 +1438,19 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) u64 devid; u64 num_devices; u8 *dev_uuid; + unsigned seq; int ret = 0; bool clear_super = false; mutex_lock(&uuid_mutex); - all_avail = root->fs_info->avail_data_alloc_bits | - root->fs_info->avail_system_alloc_bits | - root->fs_info->avail_metadata_alloc_bits; + do { + seq = read_seqbegin(&root->fs_info->profiles_lock); + + all_avail = root->fs_info->avail_data_alloc_bits | + root->fs_info->avail_system_alloc_bits | + root->fs_info->avail_metadata_alloc_bits; + } while (read_seqretry(&root->fs_info->profiles_lock, seq)); num_devices = root->fs_info->fs_devices->num_devices; btrfs_dev_replace_lock(&root->fs_info->dev_replace); @@ -1403,6 +1474,21 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) goto out; } + if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && + root->fs_info->fs_devices->rw_devices <= 2) { + printk(KERN_ERR "btrfs: unable to go below two " + "devices on raid5\n"); + ret = -EINVAL; + goto out; + } + if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && + root->fs_info->fs_devices->rw_devices <= 3) { + printk(KERN_ERR "btrfs: unable to go below three " + "devices on raid6\n"); + ret = -EINVAL; + goto out; + } + if (strcmp(device_path, "missing") == 0) { struct list_head *devices; struct btrfs_device *tmp; @@ -1556,7 +1642,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) ret = 0; /* Notify udev that device has changed */ - btrfs_kobject_uevent(bdev, KOBJ_CHANGE); + if (bdev) + btrfs_kobject_uevent(bdev, KOBJ_CHANGE); error_brelse: brelse(bh); @@ -2298,7 +2385,11 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, return ret; trans = btrfs_start_transaction(root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_std_error(root->fs_info, ret); + return ret; + } lock_chunks(root); @@ -2615,7 +2706,7 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, chunk_used = btrfs_block_group_used(&cache->item); if (bargs->usage == 0) - user_thresh = 0; + user_thresh = 1; else if (bargs->usage > 100) user_thresh = cache->key.offset; else @@ -2663,11 +2754,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf, return 0; if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) - factor = 2; - else - factor = 1; - factor = num_stripes / factor; + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { + factor = num_stripes / 2; + } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { + factor = num_stripes - 1; + } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { + factor = num_stripes - 2; + } else { + factor = num_stripes; + } for (i = 0; i < num_stripes; i++) { stripe = btrfs_stripe_nr(chunk, i); @@ -2965,7 +3060,8 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) unset_balance_control(fs_info); ret = del_balance_item(fs_info->tree_root); - BUG_ON(ret); + if (ret) + btrfs_std_error(fs_info, ret); atomic_set(&fs_info->mutually_exclusive_operation_running, 0); } @@ -2984,6 +3080,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, int mixed = 0; int ret; u64 num_devices; + unsigned seq; if (btrfs_fs_closing(fs_info) || atomic_read(&fs_info->balance_pause_req) || @@ -3026,7 +3123,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl, allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); else allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10); + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6); if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && (!alloc_profile_is_valid(bctl->data.target, 1) || @@ -3066,23 +3165,29 @@ int btrfs_balance(struct btrfs_balance_control *bctl, /* allow to reduce meta or sys integrity only if force set */ allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10; - if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (fs_info->avail_system_alloc_bits & allowed) && - !(bctl->sys.target & allowed)) || - ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (fs_info->avail_metadata_alloc_bits & allowed) && - !(bctl->meta.target & allowed))) { - if (bctl->flags & BTRFS_BALANCE_FORCE) { - printk(KERN_INFO "btrfs: force reducing metadata " - "integrity\n"); - } else { - printk(KERN_ERR "btrfs: balance will reduce metadata " - "integrity, use force if you want this\n"); - ret = -EINVAL; - goto out; + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6; + do { + seq = read_seqbegin(&fs_info->profiles_lock); + + if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (fs_info->avail_system_alloc_bits & allowed) && + !(bctl->sys.target & allowed)) || + ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (fs_info->avail_metadata_alloc_bits & allowed) && + !(bctl->meta.target & allowed))) { + if (bctl->flags & BTRFS_BALANCE_FORCE) { + printk(KERN_INFO "btrfs: force reducing metadata " + "integrity\n"); + } else { + printk(KERN_ERR "btrfs: balance will reduce metadata " + "integrity, use force if you want this\n"); + ret = -EINVAL; + goto out; + } } - } + } while (read_seqretry(&fs_info->profiles_lock, seq)); if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { int num_tolerated_disk_barrier_failures; @@ -3126,6 +3231,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl, mutex_lock(&fs_info->balance_mutex); atomic_dec(&fs_info->balance_running); + if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { + fs_info->num_tolerated_disk_barrier_failures = + btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); + } + if (bargs) { memset(bargs, 0, sizeof(*bargs)); update_ioctl_balance_args(fs_info, 0, bargs); @@ -3136,11 +3246,6 @@ int btrfs_balance(struct btrfs_balance_control *bctl, __cancel_balance(fs_info); } - if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { - fs_info->num_tolerated_disk_barrier_failures = - btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); - } - wake_up(&fs_info->balance_wait_q); return ret; @@ -3503,13 +3608,86 @@ static int btrfs_cmp_device_info(const void *a, const void *b) } struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { - { 2, 1, 0, 4, 2, 2 /* raid10 */ }, - { 1, 1, 2, 2, 2, 2 /* raid1 */ }, - { 1, 2, 1, 1, 1, 2 /* dup */ }, - { 1, 1, 0, 2, 1, 1 /* raid0 */ }, - { 1, 1, 1, 1, 1, 1 /* single */ }, + [BTRFS_RAID_RAID10] = { + .sub_stripes = 2, + .dev_stripes = 1, + .devs_max = 0, /* 0 == as many as possible */ + .devs_min = 4, + .devs_increment = 2, + .ncopies = 2, + }, + [BTRFS_RAID_RAID1] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 2, + .devs_min = 2, + .devs_increment = 2, + .ncopies = 2, + }, + [BTRFS_RAID_DUP] = { + .sub_stripes = 1, + .dev_stripes = 2, + .devs_max = 1, + .devs_min = 1, + .devs_increment = 1, + .ncopies = 2, + }, + [BTRFS_RAID_RAID0] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 2, + .devs_increment = 1, + .ncopies = 1, + }, + [BTRFS_RAID_SINGLE] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 1, + .devs_min = 1, + .devs_increment = 1, + .ncopies = 1, + }, + [BTRFS_RAID_RAID5] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 2, + .devs_increment = 1, + .ncopies = 2, + }, + [BTRFS_RAID_RAID6] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 3, + .devs_increment = 1, + .ncopies = 3, + }, }; +static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) +{ + /* TODO allow them to set a preferred stripe size */ + return 64 * 1024; +} + +static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) +{ + u64 features; + + if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))) + return; + + features = btrfs_super_incompat_flags(info->super_copy); + if (features & BTRFS_FEATURE_INCOMPAT_RAID56) + return; + + features |= BTRFS_FEATURE_INCOMPAT_RAID56; + btrfs_set_super_incompat_flags(info->super_copy, features); + printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n"); +} + static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, struct map_lookup **map_ret, @@ -3525,6 +3703,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_device_info *devices_info = NULL; u64 total_avail; int num_stripes; /* total number of stripes to allocate */ + int data_stripes; /* number of stripes that count for + block group size */ int sub_stripes; /* sub_stripes info for map */ int dev_stripes; /* stripes per dev */ int devs_max; /* max devs to use */ @@ -3536,6 +3716,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 max_chunk_size; u64 stripe_size; u64 num_bytes; + u64 raid_stripe_len = BTRFS_STRIPE_LEN; int ndevs; int i; int j; @@ -3630,12 +3811,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) continue; + if (ndevs == fs_devices->rw_devices) { + WARN(1, "%s: found more than %llu devices\n", + __func__, fs_devices->rw_devices); + break; + } devices_info[ndevs].dev_offset = dev_offset; devices_info[ndevs].max_avail = max_avail; devices_info[ndevs].total_avail = total_avail; devices_info[ndevs].dev = device; ++ndevs; - WARN_ON(ndevs > fs_devices->rw_devices); } /* @@ -3661,16 +3846,48 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, stripe_size = devices_info[ndevs-1].max_avail; num_stripes = ndevs * dev_stripes; - if (stripe_size * ndevs > max_chunk_size * ncopies) { - stripe_size = max_chunk_size * ncopies; - do_div(stripe_size, ndevs); + /* + * this will have to be fixed for RAID1 and RAID10 over + * more drives + */ + data_stripes = num_stripes / ncopies; + + if (type & BTRFS_BLOCK_GROUP_RAID5) { + raid_stripe_len = find_raid56_stripe_len(ndevs - 1, + btrfs_super_stripesize(info->super_copy)); + data_stripes = num_stripes - 1; + } + if (type & BTRFS_BLOCK_GROUP_RAID6) { + raid_stripe_len = find_raid56_stripe_len(ndevs - 2, + btrfs_super_stripesize(info->super_copy)); + data_stripes = num_stripes - 2; + } + + /* + * Use the number of data stripes to figure out how big this chunk + * is really going to be in terms of logical address space, + * and compare that answer with the max chunk size + */ + if (stripe_size * data_stripes > max_chunk_size) { + u64 mask = (1ULL << 24) - 1; + stripe_size = max_chunk_size; + do_div(stripe_size, data_stripes); + + /* bump the answer up to a 16MB boundary */ + stripe_size = (stripe_size + mask) & ~mask; + + /* but don't go higher than the limits we found + * while searching for free extents + */ + if (stripe_size > devices_info[ndevs-1].max_avail) + stripe_size = devices_info[ndevs-1].max_avail; } do_div(stripe_size, dev_stripes); /* align to BTRFS_STRIPE_LEN */ - do_div(stripe_size, BTRFS_STRIPE_LEN); - stripe_size *= BTRFS_STRIPE_LEN; + do_div(stripe_size, raid_stripe_len); + stripe_size *= raid_stripe_len; map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); if (!map) { @@ -3688,14 +3905,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, } } map->sector_size = extent_root->sectorsize; - map->stripe_len = BTRFS_STRIPE_LEN; - map->io_align = BTRFS_STRIPE_LEN; - map->io_width = BTRFS_STRIPE_LEN; + map->stripe_len = raid_stripe_len; + map->io_align = raid_stripe_len; + map->io_width = raid_stripe_len; map->type = type; map->sub_stripes = sub_stripes; *map_ret = map; - num_bytes = stripe_size * (num_stripes / ncopies); + num_bytes = stripe_size * data_stripes; *stripe_size_out = stripe_size; *num_bytes_out = num_bytes; @@ -3717,15 +3934,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); - free_extent_map(em); - if (ret) - goto error; - - ret = btrfs_make_block_group(trans, extent_root, 0, type, - BTRFS_FIRST_CHUNK_TREE_OBJECTID, - start, num_bytes); - if (ret) + if (ret) { + free_extent_map(em); goto error; + } for (i = 0; i < map->num_stripes; ++i) { struct btrfs_device *device; @@ -3738,15 +3950,44 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, info->chunk_root->root_key.objectid, BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, dev_offset, stripe_size); - if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); - goto error; - } + if (ret) + goto error_dev_extent; + } + + ret = btrfs_make_block_group(trans, extent_root, 0, type, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, + start, num_bytes); + if (ret) { + i = map->num_stripes - 1; + goto error_dev_extent; } + free_extent_map(em); + check_raid56_incompat_flag(extent_root->fs_info, type); + kfree(devices_info); return 0; +error_dev_extent: + for (; i >= 0; i--) { + struct btrfs_device *device; + int err; + + device = map->stripes[i].dev; + err = btrfs_free_dev_extent(trans, device, start); + if (err) { + btrfs_abort_transaction(trans, extent_root, err); + break; + } + } + write_lock(&em_tree->lock); + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + + /* One for our allocation */ + free_extent_map(em); + /* One for the tree reference */ + free_extent_map(em); error: kfree(map); kfree(devices_info); @@ -3886,10 +4127,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, if (ret) return ret; - alloc_profile = BTRFS_BLOCK_GROUP_METADATA | - fs_info->avail_metadata_alloc_bits; - alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); - + alloc_profile = btrfs_get_alloc_profile(extent_root, 0); ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, &stripe_size, chunk_offset, alloc_profile); if (ret) @@ -3897,10 +4135,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, sys_chunk_offset = chunk_offset + chunk_size; - alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | - fs_info->avail_system_alloc_bits; - alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); - + alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, &sys_chunk_size, &sys_stripe_size, sys_chunk_offset, alloc_profile); @@ -4013,6 +4248,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) ret = map->num_stripes; else if (map->type & BTRFS_BLOCK_GROUP_RAID10) ret = map->sub_stripes; + else if (map->type & BTRFS_BLOCK_GROUP_RAID5) + ret = 2; + else if (map->type & BTRFS_BLOCK_GROUP_RAID6) + ret = 3; else ret = 1; free_extent_map(em); @@ -4025,6 +4264,52 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) return ret; } +unsigned long btrfs_full_stripe_len(struct btrfs_root *root, + struct btrfs_mapping_tree *map_tree, + u64 logical) +{ + struct extent_map *em; + struct map_lookup *map; + struct extent_map_tree *em_tree = &map_tree->map_tree; + unsigned long len = root->sectorsize; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, len); + read_unlock(&em_tree->lock); + BUG_ON(!em); + + BUG_ON(em->start > logical || em->start + em->len < logical); + map = (struct map_lookup *)em->bdev; + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + len = map->stripe_len * nr_data_stripes(map); + } + free_extent_map(em); + return len; +} + +int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, + u64 logical, u64 len, int mirror_num) +{ + struct extent_map *em; + struct map_lookup *map; + struct extent_map_tree *em_tree = &map_tree->map_tree; + int ret = 0; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, len); + read_unlock(&em_tree->lock); + BUG_ON(!em); + + BUG_ON(em->start > logical || em->start + em->len < logical); + map = (struct map_lookup *)em->bdev; + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) + ret = 1; + free_extent_map(em); + return ret; +} + static int find_live_mirror(struct btrfs_fs_info *fs_info, struct map_lookup *map, int first, int num, int optimal, int dev_replace_is_ongoing) @@ -4062,10 +4347,39 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, return optimal; } +static inline int parity_smaller(u64 a, u64 b) +{ + return a > b; +} + +/* Bubble-sort the stripe set to put the parity/syndrome stripes last */ +static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) +{ + struct btrfs_bio_stripe s; + int i; + u64 l; + int again = 1; + + while (again) { + again = 0; + for (i = 0; i < bbio->num_stripes - 1; i++) { + if (parity_smaller(raid_map[i], raid_map[i+1])) { + s = bbio->stripes[i]; + l = raid_map[i]; + bbio->stripes[i] = bbio->stripes[i+1]; + raid_map[i] = raid_map[i+1]; + bbio->stripes[i+1] = s; + raid_map[i+1] = l; + again = 1; + } + } + } +} + static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, - int mirror_num) + int mirror_num, u64 **raid_map_ret) { struct extent_map *em; struct map_lookup *map; @@ -4077,6 +4391,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 stripe_nr; u64 stripe_nr_orig; u64 stripe_nr_end; + u64 stripe_len; + u64 *raid_map = NULL; int stripe_index; int i; int ret = 0; @@ -4088,6 +4404,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, int num_alloc_stripes; int patch_the_first_stripe_for_dev_replace = 0; u64 physical_to_patch_in_first_stripe = 0; + u64 raid56_full_stripe_start = (u64)-1; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, *length); @@ -4104,29 +4421,63 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, map = (struct map_lookup *)em->bdev; offset = logical - em->start; + if (mirror_num > map->num_stripes) + mirror_num = 0; + + stripe_len = map->stripe_len; stripe_nr = offset; /* * stripe_nr counts the total number of stripes we have to stride * to get to this block */ - do_div(stripe_nr, map->stripe_len); + do_div(stripe_nr, stripe_len); - stripe_offset = stripe_nr * map->stripe_len; + stripe_offset = stripe_nr * stripe_len; BUG_ON(offset < stripe_offset); /* stripe_offset is the offset of this block in its stripe*/ stripe_offset = offset - stripe_offset; - if (rw & REQ_DISCARD) + /* if we're here for raid56, we need to know the stripe aligned start */ + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { + unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); + raid56_full_stripe_start = offset; + + /* allow a write of a full stripe, but make sure we don't + * allow straddling of stripes + */ + do_div(raid56_full_stripe_start, full_stripe_len); + raid56_full_stripe_start *= full_stripe_len; + } + + if (rw & REQ_DISCARD) { + /* we don't discard raid56 yet */ + if (map->type & + (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { + ret = -EOPNOTSUPP; + goto out; + } *length = min_t(u64, em->len - offset, *length); - else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { - /* we limit the length of each bio to what fits in a stripe */ - *length = min_t(u64, em->len - offset, - map->stripe_len - stripe_offset); + } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + u64 max_len; + /* For writes to RAID[56], allow a full stripeset across all disks. + For other RAID types and for RAID[56] reads, just allow a single + stripe (on a single disk). */ + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) && + (rw & REQ_WRITE)) { + max_len = stripe_len * nr_data_stripes(map) - + (offset - raid56_full_stripe_start); + } else { + /* we limit the length of each bio to what fits in a stripe */ + max_len = stripe_len - stripe_offset; + } + *length = min_t(u64, em->len - offset, max_len); } else { *length = em->len - offset; } + /* This is for when we're called from btrfs_merge_bio_hook() and all + it cares about is the length */ if (!bbio_ret) goto out; @@ -4159,7 +4510,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 physical_of_found = 0; ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, - logical, &tmp_length, &tmp_bbio, 0); + logical, &tmp_length, &tmp_bbio, 0, NULL); if (ret) { WARN_ON(tmp_bbio != NULL); goto out; @@ -4220,11 +4571,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, num_stripes = 1; stripe_index = 0; stripe_nr_orig = stripe_nr; - stripe_nr_end = (offset + *length + map->stripe_len - 1) & - (~(map->stripe_len - 1)); + stripe_nr_end = ALIGN(offset + *length, map->stripe_len); do_div(stripe_nr_end, map->stripe_len); stripe_end_offset = stripe_nr_end * map->stripe_len - (offset + *length); + if (map->type & BTRFS_BLOCK_GROUP_RAID0) { if (rw & REQ_DISCARD) num_stripes = min_t(u64, map->num_stripes, @@ -4275,6 +4626,65 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, dev_replace_is_ongoing); mirror_num = stripe_index - old_stripe_index + 1; } + + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + u64 tmp; + + if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1) + && raid_map_ret) { + int i, rot; + + /* push stripe_nr back to the start of the full stripe */ + stripe_nr = raid56_full_stripe_start; + do_div(stripe_nr, stripe_len); + + stripe_index = do_div(stripe_nr, nr_data_stripes(map)); + + /* RAID[56] write or recovery. Return all stripes */ + num_stripes = map->num_stripes; + max_errors = nr_parity_stripes(map); + + raid_map = kmalloc(sizeof(u64) * num_stripes, + GFP_NOFS); + if (!raid_map) { + ret = -ENOMEM; + goto out; + } + + /* Work out the disk rotation on this stripe-set */ + tmp = stripe_nr; + rot = do_div(tmp, num_stripes); + + /* Fill in the logical address of each stripe */ + tmp = stripe_nr * nr_data_stripes(map); + for (i = 0; i < nr_data_stripes(map); i++) + raid_map[(i+rot) % num_stripes] = + em->start + (tmp + i) * map->stripe_len; + + raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; + if (map->type & BTRFS_BLOCK_GROUP_RAID6) + raid_map[(i+rot+1) % num_stripes] = + RAID6_Q_STRIPE; + + *length = map->stripe_len; + stripe_index = 0; + stripe_offset = 0; + } else { + /* + * Mirror #0 or #1 means the original data block. + * Mirror #2 is RAID5 parity block. + * Mirror #3 is RAID6 Q block. + */ + stripe_index = do_div(stripe_nr, nr_data_stripes(map)); + if (mirror_num > 1) + stripe_index = nr_data_stripes(map) + + mirror_num - 2; + + /* We distribute the parity blocks across stripes */ + tmp = stripe_nr + stripe_index; + stripe_index = do_div(tmp, map->num_stripes); + } } else { /* * after this do_div call, stripe_nr is the number of stripes @@ -4383,8 +4793,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_DUP)) { max_errors = 1; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { + max_errors = 2; } } @@ -4485,6 +4898,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, bbio->stripes[0].physical = physical_to_patch_in_first_stripe; bbio->mirror_num = map->num_stripes + 1; } + if (raid_map) { + sort_parity_stripes(bbio, raid_map); + *raid_map_ret = raid_map; + } out: if (dev_replace_is_ongoing) btrfs_dev_replace_unlock(dev_replace); @@ -4497,7 +4914,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, struct btrfs_bio **bbio_ret, int mirror_num) { return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, - mirror_num); + mirror_num, NULL); } int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, @@ -4511,20 +4928,39 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, u64 bytenr; u64 length; u64 stripe_nr; + u64 rmap_len; int i, j, nr = 0; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, chunk_start, 1); read_unlock(&em_tree->lock); - BUG_ON(!em || em->start != chunk_start); + if (!em) { + printk(KERN_ERR "btrfs: couldn't find em for chunk %Lu\n", + chunk_start); + return -EIO; + } + + if (em->start != chunk_start) { + printk(KERN_ERR "btrfs: bad chunk start, em=%Lu, wanted=%Lu\n", + em->start, chunk_start); + free_extent_map(em); + return -EIO; + } map = (struct map_lookup *)em->bdev; length = em->len; + rmap_len = map->stripe_len; + if (map->type & BTRFS_BLOCK_GROUP_RAID10) do_div(length, map->num_stripes / map->sub_stripes); else if (map->type & BTRFS_BLOCK_GROUP_RAID0) do_div(length, map->num_stripes); + else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + do_div(length, nr_data_stripes(map)); + rmap_len = map->stripe_len * nr_data_stripes(map); + } buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); BUG_ON(!buf); /* -ENOMEM */ @@ -4544,8 +4980,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, do_div(stripe_nr, map->sub_stripes); } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { stripe_nr = stripe_nr * map->num_stripes + i; - } - bytenr = chunk_start + stripe_nr * map->stripe_len; + } /* else if RAID[56], multiply by nr_data_stripes(). + * Alternatively, just use rmap_len below instead of + * map->stripe_len */ + + bytenr = chunk_start + stripe_nr * rmap_len; WARN_ON(nr >= map->num_stripes); for (j = 0; j < nr; j++) { if (buf[j] == bytenr) @@ -4559,7 +4998,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, *logical = buf; *naddrs = nr; - *stripe_len = map->stripe_len; + *stripe_len = rmap_len; free_extent_map(em); return 0; @@ -4633,7 +5072,7 @@ static void btrfs_end_bio(struct bio *bio, int err) bio->bi_bdev = (struct block_device *) (unsigned long)bbio->mirror_num; /* only send an error to the higher layers if it is - * beyond the tolerance of the multi-bio + * beyond the tolerance of the btrfs bio */ if (atomic_read(&bbio->error) > bbio->max_errors) { err = -EIO; @@ -4667,13 +5106,18 @@ struct async_sched { * This will add one bio to the pending list for a device and make sure * the work struct is scheduled. */ -static noinline void schedule_bio(struct btrfs_root *root, +noinline void btrfs_schedule_bio(struct btrfs_root *root, struct btrfs_device *device, int rw, struct bio *bio) { int should_queue = 1; struct btrfs_pending_bios *pending_bios; + if (device->missing || !device->bdev) { + bio_endio(bio, -EIO); + return; + } + /* don't bother with additional async steps for reads, right now */ if (!(rw & REQ_WRITE)) { bio_get(bio); @@ -4771,7 +5215,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, #endif bio->bi_bdev = dev->bdev; if (async) - schedule_bio(root, dev, rw, bio); + btrfs_schedule_bio(root, dev, rw, bio); else btrfsic_submit_bio(rw, bio); } @@ -4830,6 +5274,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, u64 logical = (u64)bio->bi_sector << 9; u64 length = 0; u64 map_length; + u64 *raid_map = NULL; int ret; int dev_nr = 0; int total_devs = 1; @@ -4838,12 +5283,30 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, length = bio->bi_size; map_length = length; - ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, - mirror_num); - if (ret) + ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, + mirror_num, &raid_map); + if (ret) /* -ENOMEM */ return ret; total_devs = bbio->num_stripes; + bbio->orig_bio = first_bio; + bbio->private = first_bio->bi_private; + bbio->end_io = first_bio->bi_end_io; + atomic_set(&bbio->stripes_pending, bbio->num_stripes); + + if (raid_map) { + /* In this case, map_length has been set to the length of + a single stripe; not the whole write */ + if (rw & WRITE) { + return raid56_parity_write(root, bio, bbio, + raid_map, map_length); + } else { + return raid56_parity_recover(root, bio, bbio, + raid_map, map_length, + mirror_num); + } + } + if (map_length < length) { printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu " "len %llu\n", (unsigned long long)logical, @@ -4852,11 +5315,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, BUG(); } - bbio->orig_bio = first_bio; - bbio->private = first_bio->bi_private; - bbio->end_io = first_bio->bi_end_io; - atomic_set(&bbio->stripes_pending, bbio->num_stripes); - while (dev_nr < total_devs) { dev = bbio->stripes[dev_nr].dev; if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index d3c3939ac751..062d8604d35b 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -21,8 +21,8 @@ #include <linux/bio.h> #include <linux/sort.h> +#include <linux/btrfs.h> #include "async-thread.h" -#include "ioctl.h" #define BTRFS_STRIPE_LEN (64 * 1024) @@ -321,7 +321,14 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); int btrfs_scratch_superblock(struct btrfs_device *device); - +void btrfs_schedule_bio(struct btrfs_root *root, + struct btrfs_device *device, + int rw, struct bio *bio); +int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, + u64 logical, u64 len, int mirror_num); +unsigned long btrfs_full_stripe_len(struct btrfs_root *root, + struct btrfs_mapping_tree *map_tree, + u64 logical); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, int index) { diff --git a/fs/buffer.c b/fs/buffer.c index 7a75c3e0fd58..b4dcb34c9635 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -41,6 +41,7 @@ #include <linux/bitops.h> #include <linux/mpage.h> #include <linux/bit_spinlock.h> +#include <trace/events/block.h> static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); @@ -53,6 +54,13 @@ void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) } EXPORT_SYMBOL(init_buffer); +inline void touch_buffer(struct buffer_head *bh) +{ + trace_block_touch_buffer(bh); + mark_page_accessed(bh->b_page); +} +EXPORT_SYMBOL(touch_buffer); + static int sleep_on_buffer(void *word) { io_schedule(); @@ -1113,6 +1121,8 @@ void mark_buffer_dirty(struct buffer_head *bh) { WARN_ON_ONCE(!buffer_uptodate(bh)); + trace_block_dirty_buffer(bh); + /* * Very *carefully* optimize the it-is-already-dirty case. * @@ -2332,7 +2342,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block) { struct page *page = vmf->page; - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct inode *inode = file_inode(vma->vm_file); unsigned long end; loff_t size; int ret; @@ -2359,7 +2369,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, if (unlikely(ret < 0)) goto out_unlock; set_page_dirty(page); - wait_on_page_writeback(page); + wait_for_stable_page(page); return 0; out_unlock: unlock_page(page); @@ -2371,7 +2381,7 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block) { int ret; - struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb; + struct super_block *sb = file_inode(vma->vm_file)->i_sb; sb_start_pagefault(sb); @@ -3227,7 +3237,7 @@ static struct kmem_cache *bh_cachep __read_mostly; * Once the number of bh's in the machine exceeds this level, we start * stripping them in writeback. */ -static int max_buffer_heads; +static unsigned long max_buffer_heads; int buffer_heads_over_limit; @@ -3343,7 +3353,7 @@ EXPORT_SYMBOL(bh_submit_read); void __init buffer_init(void) { - int nrpages; + unsigned long nrpages; bh_cachep = kmem_cache_create("buffer_head", sizeof(struct buffer_head), 0, diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 9eb134ea6eb2..49bc78243db9 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -1,6 +1,6 @@ config CEPH_FS - tristate "Ceph distributed file system (EXPERIMENTAL)" - depends on INET && EXPERIMENTAL + tristate "Ceph distributed file system" + depends on INET select CEPH_LIB select LIBCRC32C select CRYPTO_AES diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 064d1a68d2c1..a60ea977af6f 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -195,7 +195,7 @@ static int ceph_releasepage(struct page *page, gfp_t g) */ static int readpage_nounlock(struct file *filp, struct page *page) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->client->osdc; @@ -236,16 +236,10 @@ static int ceph_readpage(struct file *filp, struct page *page) static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) { struct inode *inode = req->r_inode; - struct ceph_osd_reply_head *replyhead; - int rc, bytes; + int rc = req->r_result; + int bytes = le32_to_cpu(msg->hdr.data_len); int i; - /* parse reply */ - replyhead = msg->front.iov_base; - WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); - rc = le32_to_cpu(replyhead->result); - bytes = le32_to_cpu(msg->hdr.data_len); - dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); /* unlock all pages, zeroing any data we didn't read */ @@ -315,7 +309,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, 0, ci->i_truncate_seq, ci->i_truncate_size, - NULL, false, 1, 0); + NULL, false, 0); if (IS_ERR(req)) return PTR_ERR(req); @@ -370,7 +364,7 @@ out: static int ceph_readpages(struct file *file, struct address_space *mapping, struct list_head *page_list, unsigned nr_pages) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); int rc = 0; int max = 0; @@ -492,8 +486,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) &ci->i_layout, snapc, page_off, len, ci->i_truncate_seq, ci->i_truncate_size, - &inode->i_mtime, - &page, 1, 0, 0, true); + &inode->i_mtime, &page, 1); if (err < 0) { dout("writepage setting page/mapping error %d %p\n", err, page); SetPageError(page); @@ -554,27 +547,18 @@ static void writepages_finish(struct ceph_osd_request *req, struct ceph_msg *msg) { struct inode *inode = req->r_inode; - struct ceph_osd_reply_head *replyhead; - struct ceph_osd_op *op; struct ceph_inode_info *ci = ceph_inode(inode); unsigned wrote; struct page *page; int i; struct ceph_snap_context *snapc = req->r_snapc; struct address_space *mapping = inode->i_mapping; - __s32 rc = -EIO; - u64 bytes = 0; + int rc = req->r_result; + u64 bytes = le64_to_cpu(req->r_request_ops[0].extent.length); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); long writeback_stat; unsigned issued = ceph_caps_issued(ci); - /* parse reply */ - replyhead = msg->front.iov_base; - WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); - op = (void *)(replyhead + 1); - rc = le32_to_cpu(replyhead->result); - bytes = le64_to_cpu(op->extent.length); - if (rc >= 0) { /* * Assume we wrote the pages we originally sent. The @@ -741,8 +725,6 @@ retry: struct page *page; int want; u64 offset, len; - struct ceph_osd_request_head *reqhead; - struct ceph_osd_op *op; long writeback_stat; next = 0; @@ -838,7 +820,7 @@ get_more_pages: snapc, do_sync, ci->i_truncate_seq, ci->i_truncate_size, - &inode->i_mtime, true, 1, 0); + &inode->i_mtime, true, 0); if (IS_ERR(req)) { rc = PTR_ERR(req); @@ -906,10 +888,8 @@ get_more_pages: /* revise final length, page count */ req->r_num_pages = locked_pages; - reqhead = req->r_request->front.iov_base; - op = (void *)(reqhead + 1); - op->extent.length = cpu_to_le64(len); - op->payload_len = cpu_to_le32(len); + req->r_request_ops[0].extent.length = cpu_to_le64(len); + req->r_request_ops[0].payload_len = cpu_to_le32(len); req->r_request->hdr.data_len = cpu_to_le32(len); rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); @@ -977,7 +957,7 @@ static int ceph_update_writeable_page(struct file *file, loff_t pos, unsigned len, struct page *page) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; loff_t page_off = pos & PAGE_CACHE_MASK; @@ -1086,7 +1066,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi = file->private_data; struct page *page; @@ -1144,7 +1124,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; @@ -1228,7 +1208,7 @@ const struct address_space_operations ceph_aops = { */ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct inode *inode = file_inode(vma->vm_file); struct page *page = vmf->page; struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; loff_t off = page_offset(page); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index a1d9bb30c1bf..78e2f575247d 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -611,8 +611,16 @@ retry: if (flags & CEPH_CAP_FLAG_AUTH) ci->i_auth_cap = cap; - else if (ci->i_auth_cap == cap) + else if (ci->i_auth_cap == cap) { ci->i_auth_cap = NULL; + spin_lock(&mdsc->cap_dirty_lock); + if (!list_empty(&ci->i_dirty_item)) { + dout(" moving %p to cap_dirty_migrating\n", inode); + list_move(&ci->i_dirty_item, + &mdsc->cap_dirty_migrating); + } + spin_unlock(&mdsc->cap_dirty_lock); + } dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", inode, ceph_vinop(inode), cap, ceph_cap_string(issued), @@ -930,7 +938,7 @@ static int send_cap_msg(struct ceph_mds_session *session, u64 size, u64 max_size, struct timespec *mtime, struct timespec *atime, u64 time_warp_seq, - uid_t uid, gid_t gid, umode_t mode, + kuid_t uid, kgid_t gid, umode_t mode, u64 xattr_version, struct ceph_buffer *xattrs_buf, u64 follows) @@ -974,8 +982,8 @@ static int send_cap_msg(struct ceph_mds_session *session, ceph_encode_timespec(&fc->atime, atime); fc->time_warp_seq = cpu_to_le32(time_warp_seq); - fc->uid = cpu_to_le32(uid); - fc->gid = cpu_to_le32(gid); + fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid)); + fc->gid = cpu_to_le32(from_kgid(&init_user_ns, gid)); fc->mode = cpu_to_le32(mode); fc->xattr_version = cpu_to_le64(xattr_version); @@ -1081,8 +1089,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, struct timespec mtime, atime; int wake = 0; umode_t mode; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; struct ceph_mds_session *session; u64 xattr_version = 0; struct ceph_buffer *xattr_blob = NULL; @@ -1460,7 +1468,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; - int file_wanted, used; + int file_wanted, used, cap_used; int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ int issued, implemented, want, retain, revoking, flushing = 0; int mds = -1; /* keep track of how far we've gone through i_caps list @@ -1563,9 +1571,14 @@ retry_locked: /* NOTE: no side-effects allowed, until we take s_mutex */ + cap_used = used; + if (ci->i_auth_cap && cap != ci->i_auth_cap) + cap_used &= ~ci->i_auth_cap->issued; + revoking = cap->implemented & ~cap->issued; - dout(" mds%d cap %p issued %s implemented %s revoking %s\n", + dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n", cap->mds, cap, ceph_cap_string(cap->issued), + ceph_cap_string(cap_used), ceph_cap_string(cap->implemented), ceph_cap_string(revoking)); @@ -1593,7 +1606,7 @@ retry_locked: } /* completed revocation? going down and there are no caps? */ - if (revoking && (revoking & used) == 0) { + if (revoking && (revoking & cap_used) == 0) { dout("completed revocation of %s\n", ceph_cap_string(cap->implemented & ~cap->issued)); goto ack; @@ -1670,8 +1683,8 @@ ack: sent++; /* __send_cap drops i_ceph_lock */ - delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want, - retain, flushing, NULL); + delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used, + want, retain, flushing, NULL); goto retry; /* retake i_ceph_lock and restart our cap scan. */ } @@ -2359,10 +2372,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { inode->i_mode = le32_to_cpu(grant->mode); - inode->i_uid = le32_to_cpu(grant->uid); - inode->i_gid = le32_to_cpu(grant->gid); + inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); + inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, - inode->i_uid, inode->i_gid); + from_kuid(&init_user_ns, inode->i_uid), + from_kgid(&init_user_ns, inode->i_gid)); } if ((issued & CEPH_CAP_LINK_EXCL) == 0) @@ -2416,7 +2430,9 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, dout("mds wanted %s -> %s\n", ceph_cap_string(le32_to_cpu(grant->wanted)), ceph_cap_string(wanted)); - grant->wanted = cpu_to_le32(wanted); + /* imported cap may not have correct mds_wanted */ + if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) + check_caps = 1; } cap->seq = seq; @@ -2820,6 +2836,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, (unsigned)seq); + if (op == CEPH_CAP_OP_IMPORT) + ceph_add_cap_releases(mdsc, session); + /* lookup ino */ inode = ceph_find_inode(sb, vino); ci = ceph_inode(inode); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 8c1aabe93b67..6d797f46d772 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -238,7 +238,7 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name, static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) { struct ceph_file_info *fi = filp->private_data; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; @@ -1138,7 +1138,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct ceph_file_info *cf = file->private_data; - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); int left; const int bufsize = 1024; @@ -1188,7 +1188,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct list_head *head = &ci->i_unsafe_dirops; struct ceph_mds_request *req; diff --git a/fs/ceph/export.c b/fs/ceph/export.c index ca3ab3f9ca70..16796be53ca5 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -81,7 +81,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, if (parent_inode) { /* nfsd wants connectable */ *max_len = connected_handle_length; - type = 255; + type = FILEID_INVALID; } else { dout("encode_fh %p\n", dentry); fh->ino = ceph_ino(inode); @@ -90,7 +90,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, } } else { *max_len = handle_length; - type = 255; + type = FILEID_INVALID; } if (dentry) dput(dentry); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index e51558fca3a3..bf338d9b67e3 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -243,6 +243,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, req); + if (err) + goto out_err; + err = ceph_handle_snapdir(req, dentry, err); if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); @@ -263,6 +266,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, err = finish_no_open(file, dn); } else { dout("atomic_open finish_open on dn %p\n", dn); + if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { + *opened |= FILE_CREATED; + } err = finish_open(file, dentry, ceph_open, opened); } @@ -393,7 +399,7 @@ more: static ssize_t ceph_sync_read(struct file *file, char __user *data, unsigned len, loff_t *poff, int *checkeof) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct page **pages; u64 off = *poff; int num_pages, ret; @@ -466,7 +472,7 @@ static void sync_write_commit(struct ceph_osd_request *req, static ssize_t ceph_sync_write(struct file *file, const char __user *data, size_t left, loff_t *offset) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_osd_request *req; @@ -483,7 +489,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, int ret; struct timespec mtime = CURRENT_TIME; - if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP) + if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; dout("sync_write on file %p %lld~%u %s\n", file, *offset, @@ -535,7 +541,7 @@ more: ci->i_snap_realm->cached_context, do_sync, ci->i_truncate_seq, ci->i_truncate_size, - &mtime, false, 2, page_align); + &mtime, false, page_align); if (IS_ERR(req)) return PTR_ERR(req); @@ -637,7 +643,7 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, struct ceph_file_info *fi = filp->private_data; loff_t *ppos = &iocb->ki_pos; size_t len = iov->iov_len; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); void __user *base = iov->iov_base; ssize_t ret; @@ -707,7 +713,7 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, { struct file *file = iocb->ki_filp; struct ceph_file_info *fi = file->private_data; - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->client->osdc; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 2971eaa65cdc..851814d951cd 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -612,10 +612,11 @@ static int fill_inode(struct inode *inode, if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { inode->i_mode = le32_to_cpu(info->mode); - inode->i_uid = le32_to_cpu(info->uid); - inode->i_gid = le32_to_cpu(info->gid); + inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid)); + inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid)); dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, - inode->i_uid, inode->i_gid); + from_kuid(&init_user_ns, inode->i_uid), + from_kgid(&init_user_ns, inode->i_gid)); } if ((issued & CEPH_CAP_LINK_EXCL) == 0) @@ -1130,8 +1131,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, req->r_request_started); dout(" final dn %p\n", dn); i++; - } else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP || - req->r_op == CEPH_MDS_OP_MKSNAP) { + } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP || + req->r_op == CEPH_MDS_OP_MKSNAP) && !req->r_aborted) { struct dentry *dn = req->r_dentry; /* fill out a snapdir LOOKUPSNAP dentry */ @@ -1195,6 +1196,39 @@ done: /* * Prepopulate our cache with readdir results, leases, etc. */ +static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, + struct ceph_mds_session *session) +{ + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + int i, err = 0; + + for (i = 0; i < rinfo->dir_nr; i++) { + struct ceph_vino vino; + struct inode *in; + int rc; + + vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); + vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); + + in = ceph_get_inode(req->r_dentry->d_sb, vino); + if (IS_ERR(in)) { + err = PTR_ERR(in); + dout("new_inode badness got %d\n", err); + continue; + } + rc = fill_inode(in, &rinfo->dir_in[i], NULL, session, + req->r_request_started, -1, + &req->r_caps_reservation); + if (rc < 0) { + pr_err("fill_inode badness on %p got %d\n", in, rc); + err = rc; + continue; + } + } + + return err; +} + int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct ceph_mds_session *session) { @@ -1209,6 +1243,9 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, u64 frag = le32_to_cpu(rhead->args.readdir.frag); struct ceph_dentry_info *di; + if (req->r_aborted) + return readdir_prepopulate_inodes_only(req, session); + if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { snapdir = ceph_get_snapdir(parent->d_inode); parent = d_find_alias(snapdir); @@ -1565,26 +1602,30 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (ia_valid & ATTR_UID) { dout("setattr %p uid %d -> %d\n", inode, - inode->i_uid, attr->ia_uid); + from_kuid(&init_user_ns, inode->i_uid), + from_kuid(&init_user_ns, attr->ia_uid)); if (issued & CEPH_CAP_AUTH_EXCL) { inode->i_uid = attr->ia_uid; dirtied |= CEPH_CAP_AUTH_EXCL; } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || - attr->ia_uid != inode->i_uid) { - req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid); + !uid_eq(attr->ia_uid, inode->i_uid)) { + req->r_args.setattr.uid = cpu_to_le32( + from_kuid(&init_user_ns, attr->ia_uid)); mask |= CEPH_SETATTR_UID; release |= CEPH_CAP_AUTH_SHARED; } } if (ia_valid & ATTR_GID) { dout("setattr %p gid %d -> %d\n", inode, - inode->i_gid, attr->ia_gid); + from_kgid(&init_user_ns, inode->i_gid), + from_kgid(&init_user_ns, attr->ia_gid)); if (issued & CEPH_CAP_AUTH_EXCL) { inode->i_gid = attr->ia_gid; dirtied |= CEPH_CAP_AUTH_EXCL; } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || - attr->ia_gid != inode->i_gid) { - req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid); + !gid_eq(attr->ia_gid, inode->i_gid)) { + req->r_args.setattr.gid = cpu_to_le32( + from_kgid(&init_user_ns, attr->ia_gid)); mask |= CEPH_SETATTR_GID; release |= CEPH_CAP_AUTH_SHARED; } diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 36549a46e311..4a989345b37b 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -16,11 +16,11 @@ */ static long ceph_ioctl_get_layout(struct file *file, void __user *arg) { - struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode); + struct ceph_inode_info *ci = ceph_inode(file_inode(file)); struct ceph_ioctl_layout l; int err; - err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT); + err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT); if (!err) { l.stripe_unit = ceph_file_layout_su(ci->i_layout); l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); @@ -63,12 +63,12 @@ static long __validate_layout(struct ceph_mds_client *mdsc, static long ceph_ioctl_set_layout(struct file *file, void __user *arg) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct inode *parent_inode; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; struct ceph_ioctl_layout l; - struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode); + struct ceph_inode_info *ci = ceph_inode(file_inode(file)); struct ceph_ioctl_layout nl; int err; @@ -76,7 +76,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) return -EFAULT; /* validate changed params against current layout */ - err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT); + err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT); if (err) return err; @@ -136,7 +136,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) */ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_mds_request *req; struct ceph_ioctl_layout l; int err; @@ -179,13 +179,12 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) { struct ceph_ioctl_dataloc dl; - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->client->osdc; u64 len = 1, olen; u64 tmp; - struct ceph_object_layout ol; struct ceph_pg pgid; int r; @@ -194,7 +193,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) return -EFAULT; down_read(&osdc->map_sem); - r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len, + r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, &dl.object_no, &dl.object_offset, &olen); if (r < 0) @@ -209,10 +208,9 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", ceph_ino(inode), dl.object_no); - ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout, + ceph_calc_object_layout(&pgid, dl.object_name, &ci->i_layout, osdc->osdmap); - pgid = ol.ol_pgid; dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); if (dl.osd >= 0) { struct ceph_entity_addr *a = @@ -234,7 +232,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) static long ceph_ioctl_lazyio(struct file *file) { struct ceph_file_info *fi = file->private_data; - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 80576d05d687..202dd3d68be0 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -13,7 +13,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, int cmd, u8 wait, struct file_lock *fl) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 9165eb8309eb..442880d099c9 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -233,6 +233,30 @@ bad: } /* + * parse create results + */ +static int parse_reply_info_create(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + int features) +{ + if (features & CEPH_FEATURE_REPLY_CREATE_INODE) { + if (*p == end) { + info->has_create_ino = false; + } else { + info->has_create_ino = true; + info->ino = ceph_decode_64(p); + } + } + + if (unlikely(*p != end)) + goto bad; + return 0; + +bad: + return -EIO; +} + +/* * parse extra results */ static int parse_reply_info_extra(void **p, void *end, @@ -241,8 +265,12 @@ static int parse_reply_info_extra(void **p, void *end, { if (info->head->op == CEPH_MDS_OP_GETFILELOCK) return parse_reply_info_filelock(p, end, info, features); - else + else if (info->head->op == CEPH_MDS_OP_READDIR) return parse_reply_info_dir(p, end, info, features); + else if (info->head->op == CEPH_MDS_OP_CREATE) + return parse_reply_info_create(p, end, info, features); + else + return -EIO; } /* @@ -1658,8 +1686,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); head->op = cpu_to_le32(req->r_op); - head->caller_uid = cpu_to_le32(req->r_uid); - head->caller_gid = cpu_to_le32(req->r_gid); + head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid)); + head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid)); head->args = req->r_args; ceph_encode_filepath(&p, end, ino1, path1); @@ -2170,7 +2198,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) mutex_lock(&req->r_fill_mutex); err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); if (err == 0) { - if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK && + if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || + req->r_op == CEPH_MDS_OP_LSSNAP) && rinfo->dir_nr) ceph_readdir_prepopulate(req, req->r_session); ceph_unreserve_caps(mdsc, &req->r_caps_reservation); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index dd26846dd71d..c2a19fbbe517 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -74,6 +74,12 @@ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_info_in *dir_in; u8 dir_complete, dir_end; }; + + /* for create results */ + struct { + bool has_create_ino; + u64 ino; + }; }; /* encoded blob describing snapshot contexts for certain @@ -184,8 +190,8 @@ struct ceph_mds_request { union ceph_mds_request_args r_args; int r_fmode; /* file mode, if expecting cap */ - uid_t r_uid; - gid_t r_gid; + kuid_t r_uid; + kgid_t r_gid; /* for choosing which mds to send this request to */ int r_direct_mode; diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 73b7d44e8a35..0d3c9240c61b 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -59,6 +59,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) return ERR_PTR(-ENOMEM); ceph_decode_16_safe(p, end, version, bad); + if (version > 3) { + pr_warning("got mdsmap version %d > 3, failing", version); + goto bad; + } ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); m->m_epoch = ceph_decode_32(p); @@ -144,13 +148,13 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) /* pg_pools */ ceph_decode_32_safe(p, end, n, bad); m->m_num_data_pg_pools = n; - m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS); + m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS); if (!m->m_data_pg_pools) goto badmem; - ceph_decode_need(p, end, sizeof(u32)*(n+1), bad); + ceph_decode_need(p, end, sizeof(u64)*(n+1), bad); for (i = 0; i < n; i++) - m->m_data_pg_pools[i] = ceph_decode_32(p); - m->m_cas_pg_pool = ceph_decode_32(p); + m->m_data_pg_pools[i] = ceph_decode_64(p); + m->m_cas_pg_pool = ceph_decode_64(p); /* ok, we don't care about the rest. */ dout("mdsmap_decode success epoch %u\n", m->m_epoch); diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index cd5097d7c804..89fa4a940a0f 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c @@ -15,6 +15,7 @@ const char *ceph_mds_state_name(int s) case CEPH_MDS_STATE_BOOT: return "up:boot"; case CEPH_MDS_STATE_STANDBY: return "up:standby"; case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay"; + case CEPH_MDS_STATE_REPLAYONCE: return "up:oneshot-replay"; case CEPH_MDS_STATE_CREATING: return "up:creating"; case CEPH_MDS_STATE_STARTING: return "up:starting"; /* up and in */ @@ -50,10 +51,13 @@ const char *ceph_mds_op_name(int op) case CEPH_MDS_OP_LOOKUP: return "lookup"; case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash"; case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent"; + case CEPH_MDS_OP_LOOKUPINO: return "lookupino"; case CEPH_MDS_OP_GETATTR: return "getattr"; case CEPH_MDS_OP_SETXATTR: return "setxattr"; case CEPH_MDS_OP_SETATTR: return "setattr"; case CEPH_MDS_OP_RMXATTR: return "rmxattr"; + case CEPH_MDS_OP_SETLAYOUT: return "setlayou"; + case CEPH_MDS_OP_SETDIRLAYOUT: return "setdirlayout"; case CEPH_MDS_OP_READDIR: return "readdir"; case CEPH_MDS_OP_MKNOD: return "mknod"; case CEPH_MDS_OP_LINK: return "link"; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index e86aa9948124..6ddc0bca56b2 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -71,8 +71,14 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) /* * express utilization in terms of large blocks to avoid * overflow on 32-bit machines. + * + * NOTE: for the time being, we make bsize == frsize to humor + * not-yet-ancient versions of glibc that are broken. + * Someday, we will probably want to report a real block + * size... whatever that may mean for a network file system! */ buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; + buf->f_frsize = 1 << CEPH_BLOCK_SHIFT; buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); @@ -80,7 +86,6 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = le64_to_cpu(st.num_objects); buf->f_ffree = -1; buf->f_namelen = NAME_MAX; - buf->f_frsize = PAGE_CACHE_SIZE; /* leave fsid little-endian, regardless of host endianness */ fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1); @@ -947,6 +952,7 @@ static struct file_system_type ceph_fs_type = { .kill_sb = ceph_kill_sb, .fs_flags = FS_RENAME_DOES_D_MOVE, }; +MODULE_ALIAS_FS("ceph"); #define _STRINGIFY(x) #x #define STRINGIFY(x) _STRINGIFY(x) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 66ebe720e40d..c7b309723dcc 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -21,7 +21,7 @@ /* large granularity for statfs utilization stats to facilitate * large volume sizes on 32-bit machines. */ -#define CEPH_BLOCK_SHIFT 20 /* 1 MB */ +#define CEPH_BLOCK_SHIFT 22 /* 4 MB */ #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ @@ -138,8 +138,8 @@ struct ceph_cap_snap { struct ceph_snap_context *context; umode_t mode; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; struct ceph_buffer *xattr_blob; u64 xattr_version; @@ -798,13 +798,7 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); /* file.c */ extern const struct file_operations ceph_file_fops; extern const struct address_space_operations ceph_aops; -extern int ceph_copy_to_page_vector(struct page **pages, - const char *data, - loff_t off, size_t len); -extern int ceph_copy_from_page_vector(struct page **pages, - char *data, - loff_t off, size_t len); -extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); + extern int ceph_open(struct inode *inode, struct file *file); extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned flags, umode_t mode, diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 2c2ae5be9902..9b6b2b6dd164 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -29,9 +29,94 @@ struct ceph_vxattr { size_t name_size; /* strlen(name) + 1 (for '\0') */ size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, size_t size); - bool readonly; + bool readonly, hidden; + bool (*exists_cb)(struct ceph_inode_info *ci); }; +/* layouts */ + +static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci) +{ + size_t s; + char *p = (char *)&ci->i_layout; + + for (s = 0; s < sizeof(ci->i_layout); s++, p++) + if (*p) + return true; + return false; +} + +static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, + size_t size) +{ + int ret; + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_osd_client *osdc = &fsc->client->osdc; + s64 pool = ceph_file_layout_pg_pool(ci->i_layout); + const char *pool_name; + + dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); + down_read(&osdc->map_sem); + pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); + if (pool_name) + ret = snprintf(val, size, + "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%s", + (unsigned long long)ceph_file_layout_su(ci->i_layout), + (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), + (unsigned long long)ceph_file_layout_object_size(ci->i_layout), + pool_name); + else + ret = snprintf(val, size, + "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld", + (unsigned long long)ceph_file_layout_su(ci->i_layout), + (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), + (unsigned long long)ceph_file_layout_object_size(ci->i_layout), + (unsigned long long)pool); + + up_read(&osdc->map_sem); + return ret; +} + +static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci, + char *val, size_t size) +{ + return snprintf(val, size, "%lld", + (unsigned long long)ceph_file_layout_su(ci->i_layout)); +} + +static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci, + char *val, size_t size) +{ + return snprintf(val, size, "%lld", + (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout)); +} + +static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci, + char *val, size_t size) +{ + return snprintf(val, size, "%lld", + (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); +} + +static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, + char *val, size_t size) +{ + int ret; + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_osd_client *osdc = &fsc->client->osdc; + s64 pool = ceph_file_layout_pg_pool(ci->i_layout); + const char *pool_name; + + down_read(&osdc->map_sem); + pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); + if (pool_name) + ret = snprintf(val, size, "%s", pool_name); + else + ret = snprintf(val, size, "%lld", (unsigned long long)pool); + up_read(&osdc->map_sem); + return ret; +} + /* directories */ static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val, @@ -83,17 +168,43 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, (long)ci->i_rctime.tv_nsec); } -#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name -#define XATTR_NAME_CEPH(_type, _name) \ - { \ - .name = CEPH_XATTR_NAME(_type, _name), \ - .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ - .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ - .readonly = true, \ - } +#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name +#define CEPH_XATTR_NAME2(_type, _name, _name2) \ + XATTR_CEPH_PREFIX #_type "." #_name "." #_name2 + +#define XATTR_NAME_CEPH(_type, _name) \ + { \ + .name = CEPH_XATTR_NAME(_type, _name), \ + .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ + .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ + .readonly = true, \ + .hidden = false, \ + .exists_cb = NULL, \ + } +#define XATTR_LAYOUT_FIELD(_type, _name, _field) \ + { \ + .name = CEPH_XATTR_NAME2(_type, _name, _field), \ + .name_size = sizeof (CEPH_XATTR_NAME2(_type, _name, _field)), \ + .getxattr_cb = ceph_vxattrcb_ ## _name ## _ ## _field, \ + .readonly = false, \ + .hidden = true, \ + .exists_cb = ceph_vxattrcb_layout_exists, \ + } static struct ceph_vxattr ceph_dir_vxattrs[] = { + { + .name = "ceph.dir.layout", + .name_size = sizeof("ceph.dir.layout"), + .getxattr_cb = ceph_vxattrcb_layout, + .readonly = false, + .hidden = false, + .exists_cb = ceph_vxattrcb_layout_exists, + }, + XATTR_LAYOUT_FIELD(dir, layout, stripe_unit), + XATTR_LAYOUT_FIELD(dir, layout, stripe_count), + XATTR_LAYOUT_FIELD(dir, layout, object_size), + XATTR_LAYOUT_FIELD(dir, layout, pool), XATTR_NAME_CEPH(dir, entries), XATTR_NAME_CEPH(dir, files), XATTR_NAME_CEPH(dir, subdirs), @@ -102,35 +213,26 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { XATTR_NAME_CEPH(dir, rsubdirs), XATTR_NAME_CEPH(dir, rbytes), XATTR_NAME_CEPH(dir, rctime), - { 0 } /* Required table terminator */ + { .name = NULL, 0 } /* Required table terminator */ }; static size_t ceph_dir_vxattrs_name_size; /* total size of all names */ /* files */ -static size_t ceph_vxattrcb_file_layout(struct ceph_inode_info *ci, char *val, - size_t size) -{ - int ret; - - ret = snprintf(val, size, - "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n", - (unsigned long long)ceph_file_layout_su(ci->i_layout), - (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), - (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); - return ret; -} - static struct ceph_vxattr ceph_file_vxattrs[] = { - XATTR_NAME_CEPH(file, layout), - /* The following extended attribute name is deprecated */ { - .name = XATTR_CEPH_PREFIX "layout", - .name_size = sizeof (XATTR_CEPH_PREFIX "layout"), - .getxattr_cb = ceph_vxattrcb_file_layout, - .readonly = true, + .name = "ceph.file.layout", + .name_size = sizeof("ceph.file.layout"), + .getxattr_cb = ceph_vxattrcb_layout, + .readonly = false, + .hidden = false, + .exists_cb = ceph_vxattrcb_layout_exists, }, - { 0 } /* Required table terminator */ + XATTR_LAYOUT_FIELD(file, layout, stripe_unit), + XATTR_LAYOUT_FIELD(file, layout, stripe_count), + XATTR_LAYOUT_FIELD(file, layout, object_size), + XATTR_LAYOUT_FIELD(file, layout, pool), + { .name = NULL, 0 } /* Required table terminator */ }; static size_t ceph_file_vxattrs_name_size; /* total size of all names */ @@ -164,7 +266,8 @@ static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs) size_t size = 0; for (vxattr = vxattrs; vxattr->name; vxattr++) - size += vxattr->name_size; + if (!vxattr->hidden) + size += vxattr->name_size; return size; } @@ -572,13 +675,17 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, if (!ceph_is_valid_xattr(name)) return -ENODATA; - /* let's see if a virtual xattr was requested */ - vxattr = ceph_match_vxattr(inode, name); - spin_lock(&ci->i_ceph_lock); dout("getxattr %p ver=%lld index_ver=%lld\n", inode, ci->i_xattrs.version, ci->i_xattrs.index_version); + /* let's see if a virtual xattr was requested */ + vxattr = ceph_match_vxattr(inode, name); + if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) { + err = vxattr->getxattr_cb(ci, value, size); + goto out; + } + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { goto get_xattr; @@ -592,11 +699,6 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, spin_lock(&ci->i_ceph_lock); - if (vxattr && vxattr->readonly) { - err = vxattr->getxattr_cb(ci, value, size); - goto out; - } - err = __build_xattrs(inode); if (err < 0) goto out; @@ -604,11 +706,8 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, get_xattr: err = -ENODATA; /* == ENOATTR */ xattr = __get_xattr(ci, name); - if (!xattr) { - if (vxattr) - err = vxattr->getxattr_cb(ci, value, size); + if (!xattr) goto out; - } err = -ERANGE; if (size && size < xattr->val_len) @@ -664,23 +763,30 @@ list_xattr: vir_namelen = ceph_vxattrs_name_size(vxattrs); /* adding 1 byte per each variable due to the null termination */ - namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count; + namelen = ci->i_xattrs.names_size + ci->i_xattrs.count; err = -ERANGE; - if (size && namelen > size) + if (size && vir_namelen + namelen > size) goto out; - err = namelen; + err = namelen + vir_namelen; if (size == 0) goto out; names = __copy_xattr_names(ci, names); /* virtual xattr names, too */ - if (vxattrs) + err = namelen; + if (vxattrs) { for (i = 0; vxattrs[i].name; i++) { - len = sprintf(names, "%s", vxattrs[i].name); - names += len + 1; + if (!vxattrs[i].hidden && + !(vxattrs[i].exists_cb && + !vxattrs[i].exists_cb(ci))) { + len = sprintf(names, "%s", vxattrs[i].name); + names += len + 1; + err += len + 1; + } } + } out: spin_unlock(&ci->i_ceph_lock); @@ -782,6 +888,10 @@ int ceph_setxattr(struct dentry *dentry, const char *name, if (vxattr && vxattr->readonly) return -EOPNOTSUPP; + /* pass any unhandled ceph.* xattrs through to the MDS */ + if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) + goto do_sync_unlocked; + /* preallocate memory for xattr name, value, index node */ err = -ENOMEM; newname = kmemdup(name, name_len + 1, GFP_NOFS); @@ -838,6 +948,7 @@ retry: do_sync: spin_unlock(&ci->i_ceph_lock); +do_sync_unlocked: err = ceph_sync_setxattr(dentry, name, value, size, flags); out: kfree(newname); @@ -892,6 +1003,10 @@ int ceph_removexattr(struct dentry *dentry, const char *name) if (vxattr && vxattr->readonly) return -EOPNOTSUPP; + /* pass any unhandled ceph.* xattrs through to the MDS */ + if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) + goto do_sync_unlocked; + err = -ENOMEM; spin_lock(&ci->i_ceph_lock); retry: @@ -931,6 +1046,7 @@ retry: return err; do_sync: spin_unlock(&ci->i_ceph_lock); +do_sync_unlocked: err = ceph_send_removexattr(dentry, name); out: return err; diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 21ff76c22a17..2906ee276408 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -155,14 +155,14 @@ config CIFS_DFS_UPCALL points. If unsure, say N. config CIFS_NFSD_EXPORT - bool "Allow nfsd to export CIFS file system (EXPERIMENTAL)" - depends on CIFS && EXPERIMENTAL && BROKEN + bool "Allow nfsd to export CIFS file system" + depends on CIFS && BROKEN help Allows NFS server to export a CIFS mounted share (nfsd over cifs) config CIFS_SMB2 - bool "SMB2 network file system support (EXPERIMENTAL)" - depends on CIFS && EXPERIMENTAL && INET + bool "SMB2 network file system support" + depends on CIFS && INET select NLS select KEYS select FSCACHE diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index cfd1ce34e0bc..1d36db114772 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -614,53 +614,10 @@ decode_negTokenInit(unsigned char *security_blob, int length, } } - /* mechlistMIC */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - /* Check if we have reached the end of the blob, but with - no mechListMic (e.g. NTLMSSP instead of KRB5) */ - if (ctx.error == ASN1_ERR_DEC_EMPTY) - goto decode_negtoken_exit; - cFYI(1, "Error decoding last part negTokenInit exit3"); - return 0; - } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { - /* tag = 3 indicating mechListMIC */ - cFYI(1, "Exit 4 cls = %d con = %d tag = %d end = %p (%d)", - cls, con, tag, end, *end); - return 0; - } - - /* sequence */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, "Error decoding last part negTokenInit exit5"); - return 0; - } else if ((cls != ASN1_UNI) || (con != ASN1_CON) - || (tag != ASN1_SEQ)) { - cFYI(1, "cls = %d con = %d tag = %d end = %p (%d)", - cls, con, tag, end, *end); - } - - /* sequence of */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, "Error decoding last part negTokenInit exit 7"); - return 0; - } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { - cFYI(1, "Exit 8 cls = %d con = %d tag = %d end = %p (%d)", - cls, con, tag, end, *end); - return 0; - } - - /* general string */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, "Error decoding last part negTokenInit exit9"); - return 0; - } else if ((cls != ASN1_UNI) || (con != ASN1_PRI) - || (tag != ASN1_GENSTR)) { - cFYI(1, "Exit10 cls = %d con = %d tag = %d end = %p (%d)", - cls, con, tag, end, *end); - return 0; - } - cFYI(1, "Need to call asn1_octets_decode() function for %s", - ctx.pointer); /* is this UTF-8 or ASCII? */ -decode_negtoken_exit: + /* + * We currently ignore anything at the end of the SPNEGO blob after + * the mechTypes have been parsed, since none of that info is + * used at the moment. + */ return 1; } diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index c865bfdfe819..37e4a72a7d1c 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -55,10 +55,10 @@ struct cifs_sb_info { unsigned int wsize; unsigned long actimeo; /* attribute cache timeout (jiffies) */ atomic_t active; - uid_t mnt_uid; - gid_t mnt_gid; - uid_t mnt_backupuid; - gid_t mnt_backupgid; + kuid_t mnt_uid; + kgid_t mnt_gid; + kuid_t mnt_backupuid; + kgid_t mnt_backupgid; umode_t mnt_file_mode; umode_t mnt_dir_mode; unsigned int mnt_cifs_flags; diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 086f381d6489..10e774761299 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -149,10 +149,12 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo) goto out; dp = description + strlen(description); - sprintf(dp, ";uid=0x%x", sesInfo->linux_uid); + sprintf(dp, ";uid=0x%x", + from_kuid_munged(&init_user_ns, sesInfo->linux_uid)); dp = description + strlen(description); - sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid); + sprintf(dp, ";creduid=0x%x", + from_kuid_munged(&init_user_ns, sesInfo->cred_uid)); if (sesInfo->user_name) { dp = description + strlen(description); diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 5cbd00e74067..f1e3f25fe004 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -266,8 +266,8 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, struct key *sidkey; char *sidstr; const struct cred *saved_cred; - uid_t fuid = cifs_sb->mnt_uid; - gid_t fgid = cifs_sb->mnt_gid; + kuid_t fuid = cifs_sb->mnt_uid; + kgid_t fgid = cifs_sb->mnt_gid; /* * If we have too many subauthorities, then something is really wrong. @@ -297,6 +297,7 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, * probably a safe assumption but might be better to check based on * sidtype. */ + BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t)); if (sidkey->datalen != sizeof(uid_t)) { rc = -EIO; cFYI(1, "%s: Downcall contained malformed key " @@ -305,10 +306,21 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, goto out_key_put; } - if (sidtype == SIDOWNER) - memcpy(&fuid, &sidkey->payload.value, sizeof(uid_t)); - else - memcpy(&fgid, &sidkey->payload.value, sizeof(gid_t)); + if (sidtype == SIDOWNER) { + kuid_t uid; + uid_t id; + memcpy(&id, &sidkey->payload.value, sizeof(uid_t)); + uid = make_kuid(&init_user_ns, id); + if (uid_valid(uid)) + fuid = uid; + } else { + kgid_t gid; + gid_t id; + memcpy(&id, &sidkey->payload.value, sizeof(gid_t)); + gid = make_kgid(&init_user_ns, id); + if (gid_valid(gid)) + fgid = gid; + } out_key_put: key_put(sidkey); @@ -346,7 +358,8 @@ init_cifs_idmap(void) if (!cred) return -ENOMEM; - keyring = keyring_alloc(".cifs_idmap", 0, 0, cred, + keyring = keyring_alloc(".cifs_idmap", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, KEY_ALLOC_NOT_IN_QUOTA, NULL); @@ -774,7 +787,7 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb, /* Convert permission bits from mode to equivalent CIFS ACL */ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, - __u32 secdesclen, __u64 nmode, uid_t uid, gid_t gid, int *aclflag) + __u32 secdesclen, __u64 nmode, kuid_t uid, kgid_t gid, int *aclflag) { int rc = 0; __u32 dacloffset; @@ -806,17 +819,19 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, *aclflag = CIFS_ACL_DACL; } else { memcpy(pnntsd, pntsd, secdesclen); - if (uid != NO_CHANGE_32) { /* chown */ + if (uid_valid(uid)) { /* chown */ + uid_t id; owner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + le32_to_cpu(pnntsd->osidoffset)); nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid), GFP_KERNEL); if (!nowner_sid_ptr) return -ENOMEM; - rc = id_to_sid(uid, SIDOWNER, nowner_sid_ptr); + id = from_kuid(&init_user_ns, uid); + rc = id_to_sid(id, SIDOWNER, nowner_sid_ptr); if (rc) { cFYI(1, "%s: Mapping error %d for owner id %d", - __func__, rc, uid); + __func__, rc, id); kfree(nowner_sid_ptr); return rc; } @@ -824,17 +839,19 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, kfree(nowner_sid_ptr); *aclflag = CIFS_ACL_OWNER; } - if (gid != NO_CHANGE_32) { /* chgrp */ + if (gid_valid(gid)) { /* chgrp */ + gid_t id; group_sid_ptr = (struct cifs_sid *)((char *)pnntsd + le32_to_cpu(pnntsd->gsidoffset)); ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid), GFP_KERNEL); if (!ngroup_sid_ptr) return -ENOMEM; - rc = id_to_sid(gid, SIDGROUP, ngroup_sid_ptr); + id = from_kgid(&init_user_ns, gid); + rc = id_to_sid(id, SIDGROUP, ngroup_sid_ptr); if (rc) { cFYI(1, "%s: Mapping error %d for group id %d", - __func__, rc, gid); + __func__, rc, id); kfree(ngroup_sid_ptr); return rc; } @@ -1002,7 +1019,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, /* Convert mode bits to an ACL so we can update the ACL on the server */ int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, - uid_t uid, gid_t gid) + kuid_t uid, kgid_t gid) { int rc = 0; int aclflag = CIFS_ACL_DACL; /* default flag to set */ diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index de7f9168a118..345fc89c4286 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -91,6 +91,30 @@ struct workqueue_struct *cifsiod_wq; __u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE]; #endif +/* + * Bumps refcount for cifs super block. + * Note that it should be only called if a referece to VFS super block is + * already held, e.g. in open-type syscalls context. Otherwise it can race with + * atomic_dec_and_test in deactivate_locked_super. + */ +void +cifs_sb_active(struct super_block *sb) +{ + struct cifs_sb_info *server = CIFS_SB(sb); + + if (atomic_inc_return(&server->active) == 1) + atomic_inc(&sb->s_active); +} + +void +cifs_sb_deactive(struct super_block *sb) +{ + struct cifs_sb_info *server = CIFS_SB(sb); + + if (atomic_dec_and_test(&server->active)) + deactivate_super(sb); +} + static int cifs_read_super(struct super_block *sb) { @@ -375,13 +399,15 @@ cifs_show_options(struct seq_file *s, struct dentry *root) (int)(srcaddr->sa_family)); } - seq_printf(s, ",uid=%u", cifs_sb->mnt_uid); + seq_printf(s, ",uid=%u", + from_kuid_munged(&init_user_ns, cifs_sb->mnt_uid)); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) seq_printf(s, ",forceuid"); else seq_printf(s, ",noforceuid"); - seq_printf(s, ",gid=%u", cifs_sb->mnt_gid); + seq_printf(s, ",gid=%u", + from_kgid_munged(&init_user_ns, cifs_sb->mnt_gid)); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) seq_printf(s, ",forcegid"); else @@ -436,9 +462,13 @@ cifs_show_options(struct seq_file *s, struct dentry *root) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) seq_printf(s, ",noperm"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) - seq_printf(s, ",backupuid=%u", cifs_sb->mnt_backupuid); + seq_printf(s, ",backupuid=%u", + from_kuid_munged(&init_user_ns, + cifs_sb->mnt_backupuid)); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) - seq_printf(s, ",backupgid=%u", cifs_sb->mnt_backupgid); + seq_printf(s, ",backupgid=%u", + from_kgid_munged(&init_user_ns, + cifs_sb->mnt_backupgid)); seq_printf(s, ",rsize=%u", cifs_sb->rsize); seq_printf(s, ",wsize=%u", cifs_sb->wsize); @@ -558,6 +588,11 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) dentry = ERR_PTR(-ENOENT); break; } + if (!S_ISDIR(dir->i_mode)) { + dput(dentry); + dentry = ERR_PTR(-ENOTDIR); + break; + } /* skip separators */ while (*s == sep) @@ -677,7 +712,7 @@ out_nls: static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(iocb->ki_filp); ssize_t written; int rc; @@ -701,7 +736,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence) */ if (whence != SEEK_SET && whence != SEEK_CUR) { int rc; - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); /* * We need to be sure that all dirty pages are written and the @@ -733,7 +768,7 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) { /* note that this is called by vfs setlease with lock_flocks held to protect *lease from going away */ - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct cifsFileInfo *cfile = file->private_data; if (!(S_ISREG(inode->i_mode))) @@ -766,6 +801,7 @@ struct file_system_type cifs_fs_type = { .kill_sb = cifs_kill_sb, /* .fs_flags */ }; +MODULE_ALIAS_FS("cifs"); const struct inode_operations cifs_dir_inode_ops = { .create = cifs_create, .atomic_open = cifs_atomic_open, diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 7163419cecd9..0e32c3446ce9 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -41,6 +41,10 @@ extern struct file_system_type cifs_fs_type; extern const struct address_space_operations cifs_addr_ops; extern const struct address_space_operations cifs_addr_ops_smallbuf; +/* Functions related to super block operations */ +extern void cifs_sb_active(struct super_block *sb); +extern void cifs_sb_deactive(struct super_block *sb); + /* Functions related to inodes */ extern const struct inode_operations cifs_dir_inode_ops; extern struct inode *cifs_root_iget(struct super_block *); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index e6899cea1c35..4f07f6fbe494 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -400,11 +400,11 @@ struct smb_vol { char *iocharset; /* local code page for mapping to and from Unicode */ char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */ char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */ - uid_t cred_uid; - uid_t linux_uid; - gid_t linux_gid; - uid_t backupuid; - gid_t backupgid; + kuid_t cred_uid; + kuid_t linux_uid; + kgid_t linux_gid; + kuid_t backupuid; + kgid_t backupgid; umode_t file_mode; umode_t dir_mode; unsigned secFlg; @@ -703,8 +703,8 @@ struct cifs_ses { char *serverNOS; /* name of network operating system of server */ char *serverDomain; /* security realm of server */ __u64 Suid; /* remote smb uid */ - uid_t linux_uid; /* overriding owner of files on the mount */ - uid_t cred_uid; /* owner of credentials */ + kuid_t linux_uid; /* overriding owner of files on the mount */ + kuid_t cred_uid; /* owner of credentials */ unsigned int capabilities; char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for TCP names - will ipv6 and sctp addresses fit? */ @@ -838,7 +838,7 @@ struct cifs_tcon { */ struct tcon_link { struct rb_node tl_rbnode; - uid_t tl_uid; + kuid_t tl_uid; unsigned long tl_flags; #define TCON_LINK_MASTER 0 #define TCON_LINK_PENDING 1 @@ -931,7 +931,7 @@ struct cifsFileInfo { struct list_head tlist; /* pointer to next fid owned by tcon */ struct list_head flist; /* next fid (file instance) for this inode */ struct cifs_fid_locks *llist; /* brlocks held by this fid */ - unsigned int uid; /* allows finding which FileInfo structure */ + kuid_t uid; /* allows finding which FileInfo structure */ __u32 pid; /* process id who opened file */ struct cifs_fid fid; /* file id from remote */ /* BB add lock scope info here if needed */ ; @@ -1245,8 +1245,8 @@ struct cifs_fattr { u64 cf_eof; u64 cf_bytes; u64 cf_createtime; - uid_t cf_uid; - gid_t cf_gid; + kuid_t cf_uid; + kgid_t cf_gid; umode_t cf_mode; dev_t cf_rdev; unsigned int cf_nlink; diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index b9d59a948a2c..e996ff6b26d1 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -277,7 +277,6 @@ #define CIFS_NO_HANDLE 0xFFFF #define NO_CHANGE_64 0xFFFFFFFFFFFFFFFFULL -#define NO_CHANGE_32 0xFFFFFFFFUL /* IPC$ in ASCII */ #define CIFS_IPC_RESOURCE "\x49\x50\x43\x24" diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 1988c1baa224..f450f0683ddd 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -46,7 +46,8 @@ extern void _free_xid(unsigned int); ({ \ unsigned int __xid = _get_xid(); \ cFYI(1, "CIFS VFS: in %s as Xid: %u with uid: %d", \ - __func__, __xid, current_fsuid()); \ + __func__, __xid, \ + from_kuid(&init_user_ns, current_fsuid())); \ __xid; \ }) @@ -161,7 +162,7 @@ extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct inode *inode, const char *path, const __u16 *pfid); extern int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64, - uid_t, gid_t); + kuid_t, kgid_t); extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, const char *, u32 *); extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, @@ -304,8 +305,8 @@ struct cifs_unix_set_info_args { __u64 atime; __u64 mtime; __u64 mode; - __u64 uid; - __u64 gid; + kuid_t uid; + kgid_t gid; dev_t device; }; diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 76d0d2998850..8e2e799e7a24 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1909,9 +1909,12 @@ cifs_writev_requeue(struct cifs_writedata *wdata) } while (rc == -EAGAIN); for (i = 0; i < wdata->nr_pages; i++) { - if (rc != 0) - SetPageError(wdata->pages[i]); unlock_page(wdata->pages[i]); + if (rc != 0) { + SetPageError(wdata->pages[i]); + end_page_writeback(wdata->pages[i]); + page_cache_release(wdata->pages[i]); + } } mapping_set_error(inode->i_mapping, rc); @@ -5819,8 +5822,14 @@ static void cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset, const struct cifs_unix_set_info_args *args) { + u64 uid = NO_CHANGE_64, gid = NO_CHANGE_64; u64 mode = args->mode; + if (uid_valid(args->uid)) + uid = from_kuid(&init_user_ns, args->uid); + if (gid_valid(args->gid)) + gid = from_kgid(&init_user_ns, args->gid); + /* * Samba server ignores set of file size to zero due to bugs in some * older clients, but we should be precise - we use SetFileSize to @@ -5833,8 +5842,8 @@ cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset, data_offset->LastStatusChange = cpu_to_le64(args->ctime); data_offset->LastAccessTime = cpu_to_le64(args->atime); data_offset->LastModificationTime = cpu_to_le64(args->mtime); - data_offset->Uid = cpu_to_le64(args->uid); - data_offset->Gid = cpu_to_le64(args->gid); + data_offset->Uid = cpu_to_le64(uid); + data_offset->Gid = cpu_to_le64(gid); /* better to leave device as zero when it is */ data_offset->DevMajor = cpu_to_le64(MAJOR(args->device)); data_offset->DevMinor = cpu_to_le64(MINOR(args->device)); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 12b3da39733b..991c63c6bdd0 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -97,7 +97,7 @@ enum { Opt_user, Opt_pass, Opt_ip, Opt_unc, Opt_domain, Opt_srcaddr, Opt_prefixpath, - Opt_iocharset, Opt_sockopt, + Opt_iocharset, Opt_netbiosname, Opt_servern, Opt_ver, Opt_vers, Opt_sec, Opt_cache, @@ -202,7 +202,6 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_srcaddr, "srcaddr=%s" }, { Opt_prefixpath, "prefixpath=%s" }, { Opt_iocharset, "iocharset=%s" }, - { Opt_sockopt, "sockopt=%s" }, { Opt_netbiosname, "netbiosname=%s" }, { Opt_servern, "servern=%s" }, { Opt_ver, "ver=%s" }, @@ -987,6 +986,41 @@ static int get_option_ul(substring_t args[], unsigned long *option) return rc; } +static int get_option_uid(substring_t args[], kuid_t *result) +{ + unsigned long value; + kuid_t uid; + int rc; + + rc = get_option_ul(args, &value); + if (rc) + return rc; + + uid = make_kuid(current_user_ns(), value); + if (!uid_valid(uid)) + return -EINVAL; + + *result = uid; + return 0; +} + +static int get_option_gid(substring_t args[], kgid_t *result) +{ + unsigned long value; + kgid_t gid; + int rc; + + rc = get_option_ul(args, &value); + if (rc) + return rc; + + gid = make_kgid(current_user_ns(), value); + if (!gid_valid(gid)) + return -EINVAL; + + *result = gid; + return 0; +} static int cifs_parse_security_flavors(char *value, struct smb_vol *vol) @@ -996,7 +1030,7 @@ static int cifs_parse_security_flavors(char *value, switch (match_token(value, cifs_secflavor_tokens, args)) { case Opt_sec_krb5: - vol->secFlg |= CIFSSEC_MAY_KRB5; + vol->secFlg |= CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_SIGN; break; case Opt_sec_krb5i: vol->secFlg |= CIFSSEC_MAY_KRB5 | CIFSSEC_MUST_SIGN; @@ -1424,47 +1458,42 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, /* Numeric Values */ case Opt_backupuid: - if (get_option_ul(args, &option)) { + if (get_option_uid(args, &vol->backupuid)) { cERROR(1, "%s: Invalid backupuid value", __func__); goto cifs_parse_mount_err; } - vol->backupuid = option; vol->backupuid_specified = true; break; case Opt_backupgid: - if (get_option_ul(args, &option)) { + if (get_option_gid(args, &vol->backupgid)) { cERROR(1, "%s: Invalid backupgid value", __func__); goto cifs_parse_mount_err; } - vol->backupgid = option; vol->backupgid_specified = true; break; case Opt_uid: - if (get_option_ul(args, &option)) { + if (get_option_uid(args, &vol->linux_uid)) { cERROR(1, "%s: Invalid uid value", __func__); goto cifs_parse_mount_err; } - vol->linux_uid = option; uid_specified = true; break; case Opt_cruid: - if (get_option_ul(args, &option)) { + if (get_option_uid(args, &vol->cred_uid)) { cERROR(1, "%s: Invalid cruid value", __func__); goto cifs_parse_mount_err; } - vol->cred_uid = option; break; case Opt_gid: - if (get_option_ul(args, &option)) { + if (get_option_gid(args, &vol->linux_gid)) { cERROR(1, "%s: Invalid gid value", __func__); goto cifs_parse_mount_err; } - vol->linux_gid = option; gid_specified = true; break; case Opt_file_mode: @@ -1722,19 +1751,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, */ cFYI(1, "iocharset set to %s", string); break; - case Opt_sockopt: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - - if (strnicmp(string, "TCP_NODELAY", 11) == 0) { - printk(KERN_WARNING "CIFS: the " - "sockopt=TCP_NODELAY option has been " - "deprecated and will be removed " - "in 3.9\n"); - vol->sockopt_tcp_nodelay = 1; - } - break; case Opt_netbiosname: string = match_strdup(args); if (string == NULL) @@ -2241,7 +2257,7 @@ static int match_session(struct cifs_ses *ses, struct smb_vol *vol) { switch (ses->server->secType) { case Kerberos: - if (vol->cred_uid != ses->cred_uid) + if (!uid_eq(vol->cred_uid, ses->cred_uid)) return 0; break; default: @@ -2713,7 +2729,7 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) if (new->rsize && new->rsize < old->rsize) return 0; - if (old->mnt_uid != new->mnt_uid || old->mnt_gid != new->mnt_gid) + if (!uid_eq(old->mnt_uid, new->mnt_uid) || !gid_eq(old->mnt_gid, new->mnt_gid)) return 0; if (old->mnt_file_mode != new->mnt_file_mode || @@ -3919,7 +3935,7 @@ cifs_set_vol_auth(struct smb_vol *vol, struct cifs_ses *ses) } static struct cifs_tcon * -cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid) +cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) { int rc; struct cifs_tcon *master_tcon = cifs_sb_master_tcon(cifs_sb); @@ -3989,7 +4005,7 @@ cifs_sb_tcon_pending_wait(void *unused) /* find and return a tlink with given uid */ static struct tcon_link * -tlink_rb_search(struct rb_root *root, uid_t uid) +tlink_rb_search(struct rb_root *root, kuid_t uid) { struct rb_node *node = root->rb_node; struct tcon_link *tlink; @@ -3997,9 +4013,9 @@ tlink_rb_search(struct rb_root *root, uid_t uid) while (node) { tlink = rb_entry(node, struct tcon_link, tl_rbnode); - if (tlink->tl_uid > uid) + if (uid_gt(tlink->tl_uid, uid)) node = node->rb_left; - else if (tlink->tl_uid < uid) + else if (uid_lt(tlink->tl_uid, uid)) node = node->rb_right; else return tlink; @@ -4018,7 +4034,7 @@ tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink) tlink = rb_entry(*new, struct tcon_link, tl_rbnode); parent = *new; - if (tlink->tl_uid > new_tlink->tl_uid) + if (uid_gt(tlink->tl_uid, new_tlink->tl_uid)) new = &((*new)->rb_left); else new = &((*new)->rb_right); @@ -4048,7 +4064,7 @@ struct tcon_link * cifs_sb_tlink(struct cifs_sb_info *cifs_sb) { int ret; - uid_t fsuid = current_fsuid(); + kuid_t fsuid = current_fsuid(); struct tcon_link *tlink, *newtlink; if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 8719bbe0dcc3..1cd016217448 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -342,14 +342,14 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, *created |= FILE_CREATED; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { - args.uid = (__u64) current_fsuid(); + args.uid = current_fsuid(); if (inode->i_mode & S_ISGID) - args.gid = (__u64) inode->i_gid; + args.gid = inode->i_gid; else - args.gid = (__u64) current_fsgid(); + args.gid = current_fsgid(); } else { - args.uid = NO_CHANGE_64; - args.gid = NO_CHANGE_64; + args.uid = INVALID_UID; /* no change */ + args.gid = INVALID_GID; /* no change */ } CIFSSMBUnixSetFileInfo(xid, tcon, &args, fid->netfid, current->tgid); @@ -588,11 +588,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, .device = device_number, }; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { - args.uid = (__u64) current_fsuid(); - args.gid = (__u64) current_fsgid(); + args.uid = current_fsuid(); + args.gid = current_fsgid(); } else { - args.uid = NO_CHANGE_64; - args.gid = NO_CHANGE_64; + args.uid = INVALID_UID; /* no change */ + args.gid = INVALID_GID; /* no change */ } rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, &args, cifs_sb->local_nls, diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8ea6ca50a665..7a0dd99e4507 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -43,6 +43,7 @@ #include "cifs_fs_sb.h" #include "fscache.h" + static inline int cifs_convert_flags(unsigned int flags) { if ((flags & O_ACCMODE) == O_RDONLY) @@ -72,10 +73,15 @@ static u32 cifs_posix_convert_flags(unsigned int flags) else if ((flags & O_ACCMODE) == O_RDWR) posix_flags = SMB_O_RDWR; - if (flags & O_CREAT) + if (flags & O_CREAT) { posix_flags |= SMB_O_CREAT; - if (flags & O_EXCL) - posix_flags |= SMB_O_EXCL; + if (flags & O_EXCL) + posix_flags |= SMB_O_EXCL; + } else if (flags & O_EXCL) + cFYI(1, "Application %s pid %d has incorrectly set O_EXCL flag" + "but not O_CREAT on file open. Ignoring O_EXCL", + current->comm, current->tgid); + if (flags & O_TRUNC) posix_flags |= SMB_O_TRUNC; /* be safe and imply O_SYNC for O_DSYNC */ @@ -294,6 +300,8 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, INIT_WORK(&cfile->oplock_break, cifs_oplock_break); mutex_init(&cfile->fh_mutex); + cifs_sb_active(inode->i_sb); + /* * If the server returned a read oplock and we have mandatory brlocks, * set oplock level to None. @@ -343,7 +351,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); struct TCP_Server_Info *server = tcon->ses->server; struct cifsInodeInfo *cifsi = CIFS_I(inode); - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct super_block *sb = inode->i_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifsLockInfo *li, *tmp; struct cifs_fid fid; struct cifs_pending_open open; @@ -408,6 +417,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) cifs_put_tlink(cifs_file->tlink); dput(cifs_file->dentry); + cifs_sb_deactive(sb); kfree(cifs_file); } @@ -515,8 +525,8 @@ int cifs_open(struct inode *inode, struct file *file) */ struct cifs_unix_set_info_args args = { .mode = inode->i_mode, - .uid = NO_CHANGE_64, - .gid = NO_CHANGE_64, + .uid = INVALID_UID, /* no change */ + .gid = INVALID_GID, /* no change */ .ctime = NO_CHANGE_64, .atime = NO_CHANGE_64, .mtime = NO_CHANGE_64, @@ -947,7 +957,7 @@ static int cifs_posix_lock_test(struct file *file, struct file_lock *flock) { int rc = 0; - struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(file_inode(file)); unsigned char saved_type = flock->fl_type; if ((flock->fl_flags & FL_POSIX) == 0) @@ -974,7 +984,7 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock) static int cifs_posix_lock_set(struct file *file, struct file_lock *flock) { - struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(file_inode(file)); int rc = 1; if ((flock->fl_flags & FL_POSIX) == 0) @@ -1548,7 +1558,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock) cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); netfid = cfile->fid.netfid; - cinode = CIFS_I(file->f_path.dentry->d_inode); + cinode = CIFS_I(file_inode(file)); if (cap_unix(tcon->ses) && (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && @@ -1693,7 +1703,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, are always at the end of the list but since the first entry might have a close pending, we go through the whole list */ list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { - if (fsuid_only && open_file->uid != current_fsuid()) + if (fsuid_only && !uid_eq(open_file->uid, current_fsuid())) continue; if (OPEN_FMODE(open_file->f_flags) & FMODE_READ) { if (!open_file->invalidHandle) { @@ -1746,7 +1756,7 @@ refind_writable: list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { if (!any_available && open_file->pid != current->tgid) continue; - if (fsuid_only && open_file->uid != current_fsuid()) + if (fsuid_only && !uid_eq(open_file->uid, current_fsuid())) continue; if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { if (!open_file->invalidHandle) { @@ -2171,7 +2181,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, struct cifs_tcon *tcon; struct TCP_Server_Info *server; struct cifsFileInfo *smbfile = file->private_data; - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); rc = filemap_write_and_wait_range(inode->i_mapping, start, end); @@ -2246,7 +2256,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) */ int cifs_flush(struct file *file, fl_owner_t id) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); int rc = 0; if (file->f_mode & FMODE_WRITE) @@ -2480,7 +2490,7 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov, ssize_t written; struct inode *inode; - inode = iocb->ki_filp->f_path.dentry->d_inode; + inode = file_inode(iocb->ki_filp); /* * BB - optimize the way when signing is disabled. We can drop this @@ -2543,7 +2553,7 @@ ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(iocb->ki_filp); struct cifsInodeInfo *cinode = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsFileInfo *cfile = (struct cifsFileInfo *) @@ -2915,7 +2925,7 @@ ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(iocb->ki_filp); struct cifsInodeInfo *cinode = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsFileInfo *cfile = (struct cifsFileInfo *) @@ -3063,7 +3073,7 @@ static struct vm_operations_struct cifs_file_vm_ops = { int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) { int rc, xid; - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); xid = get_xid(); @@ -3356,7 +3366,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page, int rc; /* Is the page cached? */ - rc = cifs_readpage_from_fscache(file->f_path.dentry->d_inode, page); + rc = cifs_readpage_from_fscache(file_inode(file), page); if (rc == 0) goto read_complete; @@ -3371,8 +3381,8 @@ static int cifs_readpage_worker(struct file *file, struct page *page, else cFYI(1, "Bytes read %d", rc); - file->f_path.dentry->d_inode->i_atime = - current_fs_time(file->f_path.dentry->d_inode->i_sb); + file_inode(file)->i_atime = + current_fs_time(file_inode(file)->i_sb); if (PAGE_CACHE_SIZE > rc) memset(read_data + rc, 0, PAGE_CACHE_SIZE - rc); @@ -3381,7 +3391,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page, SetPageUptodate(page); /* send this page to the cache */ - cifs_readpage_to_fscache(file->f_path.dentry->d_inode, page); + cifs_readpage_to_fscache(file_inode(file), page); rc = 0; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index ed6208ff85a7..20887bf63121 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -244,15 +244,25 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info, break; } - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) - fattr->cf_uid = cifs_sb->mnt_uid; - else - fattr->cf_uid = le64_to_cpu(info->Uid); - - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) - fattr->cf_gid = cifs_sb->mnt_gid; - else - fattr->cf_gid = le64_to_cpu(info->Gid); + fattr->cf_uid = cifs_sb->mnt_uid; + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)) { + u64 id = le64_to_cpu(info->Uid); + if (id < ((uid_t)-1)) { + kuid_t uid = make_kuid(&init_user_ns, id); + if (uid_valid(uid)) + fattr->cf_uid = uid; + } + } + + fattr->cf_gid = cifs_sb->mnt_gid; + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)) { + u64 id = le64_to_cpu(info->Gid); + if (id < ((gid_t)-1)) { + kgid_t gid = make_kgid(&init_user_ns, id); + if (gid_valid(gid)) + fattr->cf_gid = gid; + } + } fattr->cf_nlink = le64_to_cpu(info->Nlinks); } @@ -289,7 +299,7 @@ cifs_get_file_info_unix(struct file *filp) unsigned int xid; FILE_UNIX_BASIC_INFO find_data; struct cifs_fattr fattr; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsFileInfo *cfile = filp->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); @@ -558,7 +568,7 @@ cifs_get_file_info(struct file *filp) unsigned int xid; FILE_ALL_INFO find_data; struct cifs_fattr fattr; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsFileInfo *cfile = filp->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); @@ -806,10 +816,9 @@ static bool inode_has_hashed_dentries(struct inode *inode) { struct dentry *dentry; - struct hlist_node *p; spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) { + hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { if (!d_unhashed(dentry) || IS_ROOT(dentry)) { spin_unlock(&inode->i_lock); return true; @@ -986,6 +995,15 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, return PTR_ERR(tlink); tcon = tlink_tcon(tlink); + /* + * We cannot rename the file if the server doesn't support + * CAP_INFOLEVEL_PASSTHRU + */ + if (!(tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU)) { + rc = -EBUSY; + goto out; + } + rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR, &netfid, &oplock, NULL, cifs_sb->local_nls, @@ -1014,7 +1032,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, current->tgid); /* although we would like to mark the file hidden if that fails we will still try to rename it */ - if (rc != 0) + if (!rc) cifsInode->cifsAttrs = dosattr; else dosattr = origattr; /* since not able to change them */ @@ -1025,7 +1043,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc != 0) { - rc = -ETXTBSY; + rc = -EBUSY; goto undo_setattr; } @@ -1044,7 +1062,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, if (rc == -ENOENT) rc = 0; else if (rc != 0) { - rc = -ETXTBSY; + rc = -EBUSY; goto undo_rename; } cifsInode->delete_pending = true; @@ -1151,15 +1169,13 @@ psx_del_no_retry: cifs_drop_nlink(inode); } else if (rc == -ENOENT) { d_drop(dentry); - } else if (rc == -ETXTBSY) { + } else if (rc == -EBUSY) { if (server->ops->rename_pending_delete) { rc = server->ops->rename_pending_delete(full_path, dentry, xid); if (rc == 0) cifs_drop_nlink(inode); } - if (rc == -ETXTBSY) - rc = -EBUSY; } else if ((rc == -EACCES) && (dosattr == 0) && inode) { attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); if (attrs == NULL) { @@ -1245,14 +1261,14 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode, .device = 0, }; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { - args.uid = (__u64)current_fsuid(); + args.uid = current_fsuid(); if (parent->i_mode & S_ISGID) - args.gid = (__u64)parent->i_gid; + args.gid = parent->i_gid; else - args.gid = (__u64)current_fsgid(); + args.gid = current_fsgid(); } else { - args.uid = NO_CHANGE_64; - args.gid = NO_CHANGE_64; + args.uid = INVALID_UID; /* no change */ + args.gid = INVALID_GID; /* no change */ } CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, cifs_sb->local_nls, @@ -1500,7 +1516,7 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, * source. Note that cross directory moves do not work with * rename by filehandle to various Windows servers. */ - if (rc == 0 || rc != -ETXTBSY) + if (rc == 0 || rc != -EBUSY) goto do_rename_exit; /* open-file renames don't work across directories */ @@ -1678,7 +1694,7 @@ cifs_invalidate_mapping(struct inode *inode) int cifs_revalidate_file_attr(struct file *filp) { int rc = 0; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data; if (!cifs_inode_needs_reval(inode)) @@ -1735,7 +1751,7 @@ out: int cifs_revalidate_file(struct file *filp) { int rc; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); rc = cifs_revalidate_file_attr(filp); if (rc) @@ -2013,12 +2029,12 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) if (attrs->ia_valid & ATTR_UID) args->uid = attrs->ia_uid; else - args->uid = NO_CHANGE_64; + args->uid = INVALID_UID; /* no change */ if (attrs->ia_valid & ATTR_GID) args->gid = attrs->ia_gid; else - args->gid = NO_CHANGE_64; + args->gid = INVALID_GID; /* no change */ if (attrs->ia_valid & ATTR_ATIME) args->atime = cifs_UnixTimeToNT(attrs->ia_atime); @@ -2086,8 +2102,8 @@ static int cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) { unsigned int xid; - uid_t uid = NO_CHANGE_32; - gid_t gid = NO_CHANGE_32; + kuid_t uid = INVALID_UID; + kgid_t gid = INVALID_GID; struct inode *inode = direntry->d_inode; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsInodeInfo *cifsInode = CIFS_I(inode); @@ -2146,7 +2162,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) #ifdef CONFIG_CIFS_ACL if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { - if (uid != NO_CHANGE_32 || gid != NO_CHANGE_32) { + if (uid_valid(uid) || gid_valid(gid)) { rc = id_mode_to_cifs_acl(inode, full_path, NO_CHANGE_64, uid, gid); if (rc) { @@ -2170,7 +2186,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) #ifdef CONFIG_CIFS_ACL if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { rc = id_mode_to_cifs_acl(inode, full_path, mode, - NO_CHANGE_32, NO_CHANGE_32); + INVALID_UID, INVALID_GID); if (rc) { cFYI(1, "%s: Setting ACL failed with error: %d", __func__, rc); diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index fd5009d56f9f..6c9f1214cf0b 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -30,7 +30,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) { - struct inode *inode = filep->f_dentry->d_inode; + struct inode *inode = file_inode(filep); int rc = -ENOTTY; /* strange error - but the precedent */ unsigned int xid; struct cifs_sb_info *cifs_sb; diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 51dc2fb6e854..9f6c4c45d21e 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -76,7 +76,7 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) } rc = crypto_shash_update(&sdescmd5->shash, link_str, link_len); if (rc) { - cERROR(1, "%s: Could not update iwth link_str", __func__); + cERROR(1, "%s: Could not update with link_str", __func__); goto symlink_hash_err; } rc = crypto_shash_final(&sdescmd5->shash, md5_hash); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 3a00c0d0cead..1b15bf839f37 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -569,7 +569,7 @@ bool backup_cred(struct cifs_sb_info *cifs_sb) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) { - if (cifs_sb->mnt_backupuid == current_fsuid()) + if (uid_eq(cifs_sb->mnt_backupuid, current_fsuid())) return true; } if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) { diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index a82bc51fdc82..c0b25b28be6c 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -62,7 +62,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = { {ERRdiffdevice, -EXDEV}, {ERRnofiles, -ENOENT}, {ERRwriteprot, -EROFS}, - {ERRbadshare, -ETXTBSY}, + {ERRbadshare, -EBUSY}, {ERRlock, -EACCES}, {ERRunsup, -EINVAL}, {ERRnosuchshare, -ENXIO}, diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index cdd6ff48246b..df40cc5fd13a 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -82,12 +82,10 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, cFYI(1, "%s: for %s", __func__, name->name); - if (parent->d_op && parent->d_op->d_hash) - parent->d_op->d_hash(parent, parent->d_inode, name); - else - name->hash = full_name_hash(name->name, name->len); + dentry = d_hash_and_lookup(parent, name); + if (unlikely(IS_ERR(dentry))) + return; - dentry = d_lookup(parent, name); if (dentry) { int err; @@ -505,7 +503,7 @@ static int cifs_entry_is_dot(struct cifs_dirent *de, bool is_unicode) whether we can use the cached search results from the previous search */ static int is_dir_changed(struct file *file) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct cifsInodeInfo *cifsInfo = CIFS_I(inode); if (cifsInfo->time == 0) @@ -778,7 +776,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) switch ((int) file->f_pos) { case 0: if (filldir(direntry, ".", 1, file->f_pos, - file->f_path.dentry->d_inode->i_ino, DT_DIR) < 0) { + file_inode(file)->i_ino, DT_DIR) < 0) { cERROR(1, "Filldir for current dir failed"); rc = -ENOMEM; break; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index c9c7aa7ed966..bceffe7b8f8d 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -744,4 +744,5 @@ struct smb_version_values smb30_values = { .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, .cap_large_files = SMB2_LARGE_FILES, + .oplock_read = SMB2_OPLOCK_LEVEL_II, }; diff --git a/fs/coda/cache.c b/fs/coda/cache.c index 958ae0e0ff8c..1da168c61d35 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -33,7 +33,7 @@ void coda_cache_enter(struct inode *inode, int mask) spin_lock(&cii->c_lock); cii->c_cached_epoch = atomic_read(&permission_epoch); - if (cii->c_uid != current_fsuid()) { + if (!uid_eq(cii->c_uid, current_fsuid())) { cii->c_uid = current_fsuid(); cii->c_cached_perm = mask; } else @@ -65,7 +65,7 @@ int coda_cache_check(struct inode *inode, int mask) spin_lock(&cii->c_lock); hit = (mask & cii->c_cached_perm) == mask && - cii->c_uid == current_fsuid() && + uid_eq(cii->c_uid, current_fsuid()) && cii->c_cached_epoch == atomic_read(&permission_epoch); spin_unlock(&cii->c_lock); diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h index b24fdfd8a3f0..c64075213218 100644 --- a/fs/coda/coda_fs_i.h +++ b/fs/coda/coda_fs_i.h @@ -25,7 +25,7 @@ struct coda_inode_info { u_short c_flags; /* flags (see below) */ unsigned int c_mapcount; /* nr of times this inode is mapped */ unsigned int c_cached_epoch; /* epoch for cached permissions */ - vuid_t c_uid; /* fsuid for cached permissions */ + kuid_t c_uid; /* fsuid for cached permissions */ unsigned int c_cached_perm; /* cached access permissions */ spinlock_t c_lock; struct inode vfs_inode; diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index 854ace712685..2849f41e72a2 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -100,9 +100,9 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr) if (attr->va_mode != (u_short) -1) inode->i_mode = attr->va_mode | inode_type; if (attr->va_uid != -1) - inode->i_uid = (uid_t) attr->va_uid; + inode->i_uid = make_kuid(&init_user_ns, (uid_t) attr->va_uid); if (attr->va_gid != -1) - inode->i_gid = (gid_t) attr->va_gid; + inode->i_gid = make_kgid(&init_user_ns, (gid_t) attr->va_gid); if (attr->va_nlink != -1) set_nlink(inode, attr->va_nlink); if (attr->va_size != -1) @@ -171,10 +171,10 @@ void coda_iattr_to_vattr(struct iattr *iattr, struct coda_vattr *vattr) vattr->va_mode = iattr->ia_mode; } if ( valid & ATTR_UID ) { - vattr->va_uid = (vuid_t) iattr->ia_uid; + vattr->va_uid = (vuid_t) from_kuid(&init_user_ns, iattr->ia_uid); } if ( valid & ATTR_GID ) { - vattr->va_gid = (vgid_t) iattr->ia_gid; + vattr->va_gid = (vgid_t) from_kgid(&init_user_ns, iattr->ia_gid); } if ( valid & ATTR_SIZE ) { vattr->va_size = iattr->ia_size; diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 49fe52d25600..b7d3a05c062c 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -397,7 +397,7 @@ static int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir) * We can't use vfs_readdir because we have to keep the file * position in sync between the coda_file and the host_file. * and as such we need grab the inode mutex. */ - struct inode *host_inode = host_file->f_path.dentry->d_inode; + struct inode *host_inode = file_inode(host_file); mutex_lock(&host_inode->i_mutex); host_file->f_pos = coda_file->f_pos; diff --git a/fs/coda/file.c b/fs/coda/file.c index 8edd404e6419..fa4c100bdc7d 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -66,7 +66,7 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos, static ssize_t coda_file_write(struct file *coda_file, const char __user *buf, size_t count, loff_t *ppos) { - struct inode *host_inode, *coda_inode = coda_file->f_path.dentry->d_inode; + struct inode *host_inode, *coda_inode = file_inode(coda_file); struct coda_file_info *cfi; struct file *host_file; ssize_t ret; @@ -78,7 +78,7 @@ coda_file_write(struct file *coda_file, const char __user *buf, size_t count, lo if (!host_file->f_op || !host_file->f_op->write) return -EINVAL; - host_inode = host_file->f_path.dentry->d_inode; + host_inode = file_inode(host_file); mutex_lock(&coda_inode->i_mutex); ret = host_file->f_op->write(host_file, buf, count, ppos); @@ -106,8 +106,8 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma) if (!host_file->f_op || !host_file->f_op->mmap) return -ENODEV; - coda_inode = coda_file->f_path.dentry->d_inode; - host_inode = host_file->f_path.dentry->d_inode; + coda_inode = file_inode(coda_file); + host_inode = file_inode(host_file); cii = ITOC(coda_inode); spin_lock(&cii->c_lock); @@ -178,7 +178,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file) err = venus_close(coda_inode->i_sb, coda_i2f(coda_inode), coda_flags, coda_file->f_cred->fsuid); - host_inode = cfi->cfi_container->f_path.dentry->d_inode; + host_inode = file_inode(cfi->cfi_container); cii = ITOC(coda_inode); /* did we mmap this file? */ @@ -202,7 +202,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file) int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync) { struct file *host_file; - struct inode *coda_inode = coda_file->f_path.dentry->d_inode; + struct inode *coda_inode = file_inode(coda_file); struct coda_file_info *cfi; int err; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index be2aa4909487..4dcc0d81a7aa 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -20,6 +20,7 @@ #include <linux/file.h> #include <linux/vfs.h> #include <linux/slab.h> +#include <linux/pid_namespace.h> #include <asm/uaccess.h> @@ -48,7 +49,7 @@ static struct inode *coda_alloc_inode(struct super_block *sb) return NULL; memset(&ei->c_fid, 0, sizeof(struct CodaFid)); ei->c_flags = 0; - ei->c_uid = 0; + ei->c_uid = GLOBAL_ROOT_UID; ei->c_cached_perm = 0; spin_lock_init(&ei->c_lock); return &ei->vfs_inode; @@ -129,7 +130,7 @@ static int get_device_index(struct coda_mount_data *data) f = fdget(data->fd); if (!f.file) goto Ebadf; - inode = f.file->f_path.dentry->d_inode; + inode = file_inode(f.file); if (!S_ISCHR(inode->i_mode) || imajor(inode) != CODA_PSDEV_MAJOR) { fdput(f); goto Ebadf; @@ -157,6 +158,9 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) int error; int idx; + if (task_active_pid_ns(current) != &init_pid_ns) + return -EINVAL; + idx = get_device_index((struct coda_mount_data *) data); /* Ignore errors in data, for backward compatibility */ @@ -325,4 +329,5 @@ struct file_system_type coda_fs_type = { .kill_sb = kill_anon_super, .fs_flags = FS_BINARY_MOUNTDATA, }; +MODULE_ALIAS_FS("coda"); diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index ee0981f1375b..3f5de96bbb58 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -52,7 +52,7 @@ static long coda_pioctl(struct file *filp, unsigned int cmd, struct path path; int error; struct PioctlData data; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = file_inode(filp); struct inode *target_inode = NULL; struct coda_inode_info *cnp; diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 761d5b31b18d..ebc2bae6c289 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -37,6 +37,7 @@ #include <linux/list.h> #include <linux/mutex.h> #include <linux/device.h> +#include <linux/pid_namespace.h> #include <asm/io.h> #include <asm/poll.h> #include <asm/uaccess.h> @@ -266,6 +267,12 @@ static int coda_psdev_open(struct inode * inode, struct file * file) struct venus_comm *vcp; int idx, err; + if (task_active_pid_ns(current) != &init_pid_ns) + return -EINVAL; + + if (current_user_ns() != &init_user_ns) + return -EINVAL; + idx = iminor(inode); if (idx < 0 || idx >= MAX_CODADEVS) return -ENODEV; diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index 0c68fd31fbf2..3a731976dc5e 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -50,9 +50,9 @@ static void *alloc_upcall(int opcode, int size) return ERR_PTR(-ENOMEM); inp->ih.opcode = opcode; - inp->ih.pid = current->pid; - inp->ih.pgid = task_pgrp_nr(current); - inp->ih.uid = current_fsuid(); + inp->ih.pid = task_pid_nr_ns(current, &init_pid_ns); + inp->ih.pgid = task_pgrp_nr_ns(current, &init_pid_ns); + inp->ih.uid = from_kuid(&init_user_ns, current_fsuid()); return (void*)inp; } @@ -157,7 +157,7 @@ int venus_lookup(struct super_block *sb, struct CodaFid *fid, } int venus_close(struct super_block *sb, struct CodaFid *fid, int flags, - vuid_t uid) + kuid_t uid) { union inputArgs *inp; union outputArgs *outp; @@ -166,7 +166,7 @@ int venus_close(struct super_block *sb, struct CodaFid *fid, int flags, insize = SIZE(release); UPARG(CODA_CLOSE); - inp->ih.uid = uid; + inp->ih.uid = from_kuid(&init_user_ns, uid); inp->coda_close.VFid = *fid; inp->coda_close.flags = flags; diff --git a/fs/compat.c b/fs/compat.c index 015e1e1f87c6..d487985dd0ea 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -558,6 +558,10 @@ ssize_t compat_rw_copy_check_uvector(int type, } *ret_pointer = iov; + ret = -EFAULT; + if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) + goto out; + /* * Single unix specification: * We should -EINVAL if an element length is not >= 0 and fitting an @@ -1080,17 +1084,12 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, if (!file->f_op) goto out; - ret = -EFAULT; - if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) - goto out; - - tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs, + ret = compat_rw_copy_check_uvector(type, uvector, nr_segs, UIO_FASTIOV, iovstack, &iov); - if (tot_len == 0) { - ret = 0; + if (ret <= 0) goto out; - } + tot_len = ret; ret = rw_verify_area(type, file, pos, tot_len); if (ret < 0) goto out; @@ -1278,8 +1277,7 @@ compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32, * Exactly like fs/open.c:sys_open(), except that it doesn't set the * O_LARGEFILE flag. */ -asmlinkage long -compat_sys_open(const char __user *filename, int flags, umode_t mode) +COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) { return do_sys_open(AT_FDCWD, filename, flags, mode); } @@ -1288,8 +1286,7 @@ compat_sys_open(const char __user *filename, int flags, umode_t mode) * Exactly like fs/open.c:sys_openat(), except that it doesn't set the * O_LARGEFILE flag. */ -asmlinkage long -compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, umode_t mode) +COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) { return do_sys_open(dfd, filename, flags, mode); } @@ -1739,55 +1736,13 @@ asmlinkage long compat_sys_signalfd(int ufd, } #endif /* CONFIG_SIGNALFD */ -#ifdef CONFIG_TIMERFD - -asmlinkage long compat_sys_timerfd_settime(int ufd, int flags, - const struct compat_itimerspec __user *utmr, - struct compat_itimerspec __user *otmr) -{ - int error; - struct itimerspec t; - struct itimerspec __user *ut; - - if (get_compat_itimerspec(&t, utmr)) - return -EFAULT; - ut = compat_alloc_user_space(2 * sizeof(struct itimerspec)); - if (copy_to_user(&ut[0], &t, sizeof(t))) - return -EFAULT; - error = sys_timerfd_settime(ufd, flags, &ut[0], &ut[1]); - if (!error && otmr) - error = (copy_from_user(&t, &ut[1], sizeof(struct itimerspec)) || - put_compat_itimerspec(otmr, &t)) ? -EFAULT: 0; - - return error; -} - -asmlinkage long compat_sys_timerfd_gettime(int ufd, - struct compat_itimerspec __user *otmr) -{ - int error; - struct itimerspec t; - struct itimerspec __user *ut; - - ut = compat_alloc_user_space(sizeof(struct itimerspec)); - error = sys_timerfd_gettime(ufd, ut); - if (!error) - error = (copy_from_user(&t, ut, sizeof(struct itimerspec)) || - put_compat_itimerspec(otmr, &t)) ? -EFAULT: 0; - - return error; -} - -#endif /* CONFIG_TIMERFD */ - #ifdef CONFIG_FHANDLE /* * Exactly like fs/open.c:sys_open_by_handle_at(), except that it * doesn't set the O_LARGEFILE flag. */ -asmlinkage long -compat_sys_open_by_handle_at(int mountdirfd, - struct file_handle __user *handle, int flags) +COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, + struct file_handle __user *, handle, int, flags) { return do_handle_open(mountdirfd, handle, flags); } diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index e2f57a007029..3ced75f765ca 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1582,7 +1582,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, case FIBMAP: case FIGETBSZ: case FIONREAD: - if (S_ISREG(f.file->f_path.dentry->d_inode->i_mode)) + if (S_ISREG(file_inode(f.file)->i_mode)) break; /*FALL THROUGH*/ diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 712b10f64c70..7aabc6ad4e9b 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1037,10 +1037,11 @@ static int configfs_dump(struct configfs_dirent *sd, int level) static int configfs_depend_prep(struct dentry *origin, struct config_item *target) { - struct configfs_dirent *child_sd, *sd = origin->d_fsdata; + struct configfs_dirent *child_sd, *sd; int ret = 0; - BUG_ON(!origin || !sd); + BUG_ON(!origin || !origin->d_fsdata); + sd = origin->d_fsdata; if (sd->s_element == target) /* Boo-yah */ goto out; @@ -1625,7 +1626,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) if (offset >= 0) break; default: - mutex_unlock(&file->f_path.dentry->d_inode->i_mutex); + mutex_unlock(&file_inode(file)->i_mutex); return -EINVAL; } if (offset != file->f_pos) { diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index aee0a7ebbd8e..7f26c3cf75ae 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -114,6 +114,7 @@ static struct file_system_type configfs_fs_type = { .mount = configfs_do_mount, .kill_sb = kill_litter_super, }; +MODULE_ALIAS_FS("configfs"); struct dentry *configfs_pin_fs(void) { diff --git a/fs/coredump.c b/fs/coredump.c index 177493272a61..c6479658d487 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -411,7 +411,7 @@ static void wait_for_dump_helpers(struct file *file) { struct pipe_inode_info *pipe; - pipe = file->f_path.dentry->d_inode->i_pipe; + pipe = file_inode(file)->i_pipe; pipe_lock(pipe); pipe->readers++; @@ -501,7 +501,7 @@ void do_coredump(siginfo_t *siginfo) * so we dump it as root in mode 2, and only into a controlled * environment (pipe handler or fully qualified path). */ - if (__get_dumpable(cprm.mm_flags) == SUID_DUMPABLE_SAFE) { + if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) { /* Setuid core dump mode */ flag = O_EXCL; /* Stop rewrite attacks */ cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */ @@ -600,7 +600,7 @@ void do_coredump(siginfo_t *siginfo) if (IS_ERR(cprm.file)) goto fail_unlock; - inode = cprm.file->f_path.dentry->d_inode; + inode = file_inode(cprm.file); if (inode->i_nlink > 1) goto close_fail; if (d_unhashed(cprm.file->f_path.dentry)) diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index c6c3f91ecf06..35b1c7bd18b7 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -351,7 +351,7 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) */ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; char *buf; unsigned int offset; @@ -573,6 +573,7 @@ static struct file_system_type cramfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("cramfs"); static int __init init_cramfs_fs(void) { diff --git a/fs/dcache.c b/fs/dcache.c index 19153a0a810c..e8bc3420d63e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -675,11 +675,10 @@ EXPORT_SYMBOL(dget_parent); static struct dentry *__d_find_alias(struct inode *inode, int want_discon) { struct dentry *alias, *discon_alias; - struct hlist_node *p; again: discon_alias = NULL; - hlist_for_each_entry(alias, p, &inode->i_dentry, d_alias) { + hlist_for_each_entry(alias, &inode->i_dentry, d_alias) { spin_lock(&alias->d_lock); if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { if (IS_ROOT(alias) && @@ -730,10 +729,9 @@ EXPORT_SYMBOL(d_find_alias); void d_prune_aliases(struct inode *inode) { struct dentry *dentry; - struct hlist_node *p; restart: spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) { + hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { spin_lock(&dentry->d_lock); if (!dentry->d_count) { __dget_dlock(dentry); @@ -1358,6 +1356,7 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH | DCACHE_OP_COMPARE | DCACHE_OP_REVALIDATE | + DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_DELETE )); dentry->d_op = op; if (!op) @@ -1368,6 +1367,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) dentry->d_flags |= DCACHE_OP_COMPARE; if (op->d_revalidate) dentry->d_flags |= DCACHE_OP_REVALIDATE; + if (op->d_weak_revalidate) + dentry->d_flags |= DCACHE_OP_WEAK_REVALIDATE; if (op->d_delete) dentry->d_flags |= DCACHE_OP_DELETE; if (op->d_prune) @@ -1440,14 +1441,13 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry, int len = entry->d_name.len; const char *name = entry->d_name.name; unsigned int hash = entry->d_name.hash; - struct hlist_node *p; if (!inode) { __d_instantiate(entry, NULL); return NULL; } - hlist_for_each_entry(alias, p, &inode->i_dentry, d_alias) { + hlist_for_each_entry(alias, &inode->i_dentry, d_alias) { /* * Don't need alias->d_lock here, because aliases with * d_parent == entry->d_parent are not subject to name or @@ -1672,7 +1672,6 @@ EXPORT_SYMBOL(d_splice_alias); struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, struct qstr *name) { - int error; struct dentry *found; struct dentry *new; @@ -1681,10 +1680,12 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, * if not go ahead and create it now. */ found = d_hash_and_lookup(dentry->d_parent, name); + if (unlikely(IS_ERR(found))) + goto err_out; if (!found) { new = d_alloc(dentry->d_parent, name); if (!new) { - error = -ENOMEM; + found = ERR_PTR(-ENOMEM); goto err_out; } @@ -1725,7 +1726,7 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, err_out: iput(inode); - return ERR_PTR(error); + return found; } EXPORT_SYMBOL(d_add_ci); @@ -1889,7 +1890,7 @@ seqretry: * dentry is returned. The caller must use dput to free the entry when it has * finished using it. %NULL is returned if the dentry does not exist. */ -struct dentry *d_lookup(struct dentry *parent, struct qstr *name) +struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name) { struct dentry *dentry; unsigned seq; @@ -1919,7 +1920,7 @@ EXPORT_SYMBOL(d_lookup); * * __d_lookup callers must be commented. */ -struct dentry *__d_lookup(struct dentry *parent, struct qstr *name) +struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name) { unsigned int len = name->len; unsigned int hash = name->hash; @@ -1997,12 +1998,10 @@ next: * @dir: Directory to search in * @name: qstr of name we wish to find * - * On hash failure or on lookup failure NULL is returned. + * On lookup failure NULL is returned; on bad name - ERR_PTR(-error) */ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) { - struct dentry *dentry = NULL; - /* * Check for a fs-specific hash function. Note that we must * calculate the standard hash first, as the d_op->d_hash() @@ -2010,13 +2009,13 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) */ name->hash = full_name_hash(name->name, name->len); if (dir->d_flags & DCACHE_OP_HASH) { - if (dir->d_op->d_hash(dir, dir->d_inode, name) < 0) - goto out; + int err = dir->d_op->d_hash(dir, dir->d_inode, name); + if (unlikely(err < 0)) + return ERR_PTR(err); } - dentry = d_lookup(dir, name); -out: - return dentry; + return d_lookup(dir, name); } +EXPORT_SYMBOL(d_hash_and_lookup); /** * d_validate - verify dentry provided from insecure source (deprecated) @@ -2394,7 +2393,7 @@ out_err: */ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) { - struct dentry *dparent, *aparent; + struct dentry *dparent; dentry_lock_for_move(anon, dentry); @@ -2402,24 +2401,15 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) write_seqcount_begin(&anon->d_seq); dparent = dentry->d_parent; - aparent = anon->d_parent; switch_names(dentry, anon); swap(dentry->d_name.hash, anon->d_name.hash); - dentry->d_parent = (aparent == anon) ? dentry : aparent; - list_del(&dentry->d_u.d_child); - if (!IS_ROOT(dentry)) - list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); - else - INIT_LIST_HEAD(&dentry->d_u.d_child); - - anon->d_parent = (dparent == dentry) ? anon : dparent; + dentry->d_parent = dentry; + list_del_init(&dentry->d_u.d_child); + anon->d_parent = dparent; list_del(&anon->d_u.d_child); - if (!IS_ROOT(anon)) - list_add(&anon->d_u.d_child, &anon->d_parent->d_subdirs); - else - INIT_LIST_HEAD(&anon->d_u.d_child); + list_add(&anon->d_u.d_child, &dparent->d_subdirs); write_seqcount_end(&dentry->d_seq); write_seqcount_end(&anon->d_seq); @@ -2552,7 +2542,6 @@ static int prepend_path(const struct path *path, bool slash = false; int error = 0; - br_read_lock(&vfsmount_lock); while (dentry != root->dentry || vfsmnt != root->mnt) { struct dentry * parent; @@ -2582,8 +2571,6 @@ static int prepend_path(const struct path *path, if (!error && !slash) error = prepend(buffer, buflen, "/", 1); -out: - br_read_unlock(&vfsmount_lock); return error; global_root: @@ -2600,7 +2587,7 @@ global_root: error = prepend(buffer, buflen, "/", 1); if (!error) error = is_mounted(vfsmnt) ? 1 : 2; - goto out; + return error; } /** @@ -2627,9 +2614,11 @@ char *__d_path(const struct path *path, int error; prepend(&res, &buflen, "\0", 1); + br_read_lock(&vfsmount_lock); write_seqlock(&rename_lock); error = prepend_path(path, root, &res, &buflen); write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); if (error < 0) return ERR_PTR(error); @@ -2646,9 +2635,11 @@ char *d_absolute_path(const struct path *path, int error; prepend(&res, &buflen, "\0", 1); + br_read_lock(&vfsmount_lock); write_seqlock(&rename_lock); error = prepend_path(path, &root, &res, &buflen); write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); if (error > 1) error = -EINVAL; @@ -2712,47 +2703,18 @@ char *d_path(const struct path *path, char *buf, int buflen) return path->dentry->d_op->d_dname(path->dentry, buf, buflen); get_fs_root(current->fs, &root); + br_read_lock(&vfsmount_lock); write_seqlock(&rename_lock); error = path_with_deleted(path, &root, &res, &buflen); + write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); if (error < 0) res = ERR_PTR(error); - write_sequnlock(&rename_lock); path_put(&root); return res; } EXPORT_SYMBOL(d_path); -/** - * d_path_with_unreachable - return the path of a dentry - * @path: path to report - * @buf: buffer to return value in - * @buflen: buffer length - * - * The difference from d_path() is that this prepends "(unreachable)" - * to paths which are unreachable from the current process' root. - */ -char *d_path_with_unreachable(const struct path *path, char *buf, int buflen) -{ - char *res = buf + buflen; - struct path root; - int error; - - if (path->dentry->d_op && path->dentry->d_op->d_dname) - return path->dentry->d_op->d_dname(path->dentry, buf, buflen); - - get_fs_root(current->fs, &root); - write_seqlock(&rename_lock); - error = path_with_deleted(path, &root, &res, &buflen); - if (error > 0) - error = prepend_unreachable(&res, &buflen); - write_sequnlock(&rename_lock); - path_put(&root); - if (error) - res = ERR_PTR(error); - - return res; -} - /* * Helper function for dentry_operations.d_dname() members */ @@ -2871,6 +2833,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) get_fs_root_and_pwd(current->fs, &root, &pwd); error = -ENOENT; + br_read_lock(&vfsmount_lock); write_seqlock(&rename_lock); if (!d_unlinked(pwd.dentry)) { unsigned long len; @@ -2880,6 +2843,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) prepend(&cwd, &buflen, "\0", 1); error = prepend_path(&pwd, &root, &cwd, &buflen); write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); if (error < 0) goto out; @@ -2900,6 +2864,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) } } else { write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); } out: @@ -3035,7 +3000,7 @@ ino_t find_inode_number(struct dentry *dir, struct qstr *name) ino_t ino = 0; dentry = d_hash_and_lookup(dir, name); - if (dentry) { + if (!IS_ERR_OR_NULL(dentry)) { if (dentry->d_inode) ino = dentry->d_inode->i_ino; dput(dentry); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index a5f12b7e228d..4888cb3fdef7 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -299,6 +299,7 @@ static struct file_system_type debug_fs_type = { .mount = debug_mount, .kill_sb = kill_litter_super, }; +MODULE_ALIAS_FS("debugfs"); static struct dentry *__create_file(const char *name, umode_t mode, struct dentry *parent, void *data, @@ -322,7 +323,6 @@ static struct dentry *__create_file(const char *name, umode_t mode, if (!parent) parent = debugfs_mount->mnt_root; - dentry = NULL; mutex_lock(&parent->d_inode->i_mutex); dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry)) { diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 472e6befc54d..073d30b9d1ac 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -243,6 +243,13 @@ static int mknod_ptmx(struct super_block *sb) struct dentry *root = sb->s_root; struct pts_fs_info *fsi = DEVPTS_SB(sb); struct pts_mount_opts *opts = &fsi->mount_opts; + kuid_t root_uid; + kgid_t root_gid; + + root_uid = make_kuid(current_user_ns(), 0); + root_gid = make_kgid(current_user_ns(), 0); + if (!uid_valid(root_uid) || !gid_valid(root_gid)) + return -EINVAL; mutex_lock(&root->d_inode->i_mutex); @@ -273,6 +280,8 @@ static int mknod_ptmx(struct super_block *sb) mode = S_IFCHR|opts->ptmxmode; init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2)); + inode->i_uid = root_uid; + inode->i_gid = root_gid; d_add(dentry, inode); @@ -438,6 +447,12 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type, if (error) return ERR_PTR(error); + /* Require newinstance for all user namespace mounts to ensure + * the mount options are not changed. + */ + if ((current_user_ns() != &init_user_ns) && !opts.newinstance) + return ERR_PTR(-EINVAL); + if (opts.newinstance) s = sget(fs_type, NULL, set_anon_super, flags, NULL); else @@ -491,6 +506,9 @@ static struct file_system_type devpts_fs_type = { .name = "devpts", .mount = devpts_mount, .kill_sb = devpts_kill_sb, +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES + .fs_flags = FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT, +#endif }; /* diff --git a/fs/direct-io.c b/fs/direct-io.c index cf5b44b10c67..f853263cf74f 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -261,9 +261,9 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is dio->end_io(dio->iocb, offset, transferred, dio->private, ret, is_async); } else { + inode_dio_done(dio->inode); if (is_async) aio_complete(dio->iocb, ret, 0); - inode_dio_done(dio->inode); } return ret; diff --git a/fs/dlm/config.c b/fs/dlm/config.c index a0387dd8b1f0..7d58d5b112b5 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -158,7 +158,7 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, unsigned int x; if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + return -EPERM; x = simple_strtoul(buf, NULL, 0); diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 77c0f70f8fe8..e7665c31f7b1 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -96,10 +96,13 @@ do { \ } +#define DLM_RTF_SHRINK 0x00000001 + struct dlm_rsbtable { struct rb_root keep; struct rb_root toss; spinlock_t lock; + uint32_t flags; }; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index a579f30f237d..1b1146670c4b 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1132,6 +1132,7 @@ static void toss_rsb(struct kref *kref) rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[r->res_bucket].keep); rsb_insert(r, &ls->ls_rsbtbl[r->res_bucket].toss); r->res_toss_time = jiffies; + ls->ls_rsbtbl[r->res_bucket].flags |= DLM_RTF_SHRINK; if (r->res_lvbptr) { dlm_free_lvb(r->res_lvbptr); r->res_lvbptr = NULL; @@ -1182,7 +1183,7 @@ static void detach_lkb(struct dlm_lkb *lkb) static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) { struct dlm_lkb *lkb; - int rv, id; + int rv; lkb = dlm_allocate_lkb(ls); if (!lkb) @@ -1198,19 +1199,13 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) mutex_init(&lkb->lkb_cb_mutex); INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work); - retry: - rv = idr_pre_get(&ls->ls_lkbidr, GFP_NOFS); - if (!rv) - return -ENOMEM; - + idr_preload(GFP_NOFS); spin_lock(&ls->ls_lkbidr_spin); - rv = idr_get_new_above(&ls->ls_lkbidr, lkb, 1, &id); - if (!rv) - lkb->lkb_id = id; + rv = idr_alloc(&ls->ls_lkbidr, lkb, 1, 0, GFP_NOWAIT); + if (rv >= 0) + lkb->lkb_id = rv; spin_unlock(&ls->ls_lkbidr_spin); - - if (rv == -EAGAIN) - goto retry; + idr_preload_end(); if (rv < 0) { log_error(ls, "create_lkb idr error %d", rv); @@ -1659,11 +1654,18 @@ static void shrink_bucket(struct dlm_ls *ls, int b) char *name; int our_nodeid = dlm_our_nodeid(); int remote_count = 0; + int need_shrink = 0; int i, len, rv; memset(&ls->ls_remove_lens, 0, sizeof(int) * DLM_REMOVE_NAMES_MAX); spin_lock(&ls->ls_rsbtbl[b].lock); + + if (!(ls->ls_rsbtbl[b].flags & DLM_RTF_SHRINK)) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + return; + } + for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = next) { next = rb_next(n); r = rb_entry(n, struct dlm_rsb, res_hashnode); @@ -1679,6 +1681,8 @@ static void shrink_bucket(struct dlm_ls *ls, int b) continue; } + need_shrink = 1; + if (!time_after_eq(jiffies, r->res_toss_time + dlm_config.ci_toss_secs * HZ)) { continue; @@ -1710,6 +1714,11 @@ static void shrink_bucket(struct dlm_ls *ls, int b) rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); dlm_free_rsb(r); } + + if (need_shrink) + ls->ls_rsbtbl[b].flags |= DLM_RTF_SHRINK; + else + ls->ls_rsbtbl[b].flags &= ~DLM_RTF_SHRINK; spin_unlock(&ls->ls_rsbtbl[b].lock); /* diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 2e99fb0c9737..3ca79d3253b9 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -796,7 +796,6 @@ static int release_lockspace(struct dlm_ls *ls, int force) */ idr_for_each(&ls->ls_lkbidr, lkb_idr_free, ls); - idr_remove_all(&ls->ls_lkbidr); idr_destroy(&ls->ls_lkbidr); /* diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index dd87a31bcc21..4f5ad246582f 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -177,12 +177,11 @@ static inline int nodeid_hash(int nodeid) static struct connection *__find_con(int nodeid) { int r; - struct hlist_node *h; struct connection *con; r = nodeid_hash(nodeid); - hlist_for_each_entry(con, h, &connection_hash[r], list) { + hlist_for_each_entry(con, &connection_hash[r], list) { if (con->nodeid == nodeid) return con; } @@ -232,13 +231,12 @@ static struct connection *__nodeid2con(int nodeid, gfp_t alloc) static void foreach_conn(void (*conn_func)(struct connection *c)) { int i; - struct hlist_node *h, *n; + struct hlist_node *n; struct connection *con; for (i = 0; i < CONN_HASH_SIZE; i++) { - hlist_for_each_entry_safe(con, h, n, &connection_hash[i], list){ + hlist_for_each_entry_safe(con, n, &connection_hash[i], list) conn_func(con); - } } } @@ -257,13 +255,12 @@ static struct connection *nodeid2con(int nodeid, gfp_t allocation) static struct connection *assoc2con(int assoc_id) { int i; - struct hlist_node *h; struct connection *con; mutex_lock(&connections_lock); for (i = 0 ; i < CONN_HASH_SIZE; i++) { - hlist_for_each_entry(con, h, &connection_hash[i], list) { + hlist_for_each_entry(con, &connection_hash[i], list) { if (con->sctp_assoc == assoc_id) { mutex_unlock(&connections_lock); return con; diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index aedea28a86a1..a6bc63f6e31b 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -305,27 +305,26 @@ static int recover_idr_empty(struct dlm_ls *ls) static int recover_idr_add(struct dlm_rsb *r) { struct dlm_ls *ls = r->res_ls; - int rv, id; - - rv = idr_pre_get(&ls->ls_recover_idr, GFP_NOFS); - if (!rv) - return -ENOMEM; + int rv; + idr_preload(GFP_NOFS); spin_lock(&ls->ls_recover_idr_lock); if (r->res_id) { - spin_unlock(&ls->ls_recover_idr_lock); - return -1; - } - rv = idr_get_new_above(&ls->ls_recover_idr, r, 1, &id); - if (rv) { - spin_unlock(&ls->ls_recover_idr_lock); - return rv; + rv = -1; + goto out_unlock; } - r->res_id = id; + rv = idr_alloc(&ls->ls_recover_idr, r, 1, 0, GFP_NOWAIT); + if (rv < 0) + goto out_unlock; + + r->res_id = rv; ls->ls_recover_list_count++; dlm_hold_rsb(r); + rv = 0; +out_unlock: spin_unlock(&ls->ls_recover_idr_lock); - return 0; + idr_preload_end(); + return rv; } static void recover_idr_del(struct dlm_rsb *r) @@ -351,24 +350,21 @@ static struct dlm_rsb *recover_idr_find(struct dlm_ls *ls, uint64_t id) return r; } -static int recover_idr_clear_rsb(int id, void *p, void *data) +static void recover_idr_clear(struct dlm_ls *ls) { - struct dlm_ls *ls = data; - struct dlm_rsb *r = p; + struct dlm_rsb *r; + int id; - r->res_id = 0; - r->res_recover_locks_count = 0; - ls->ls_recover_list_count--; + spin_lock(&ls->ls_recover_idr_lock); - dlm_put_rsb(r); - return 0; -} + idr_for_each_entry(&ls->ls_recover_idr, r, id) { + idr_remove(&ls->ls_recover_idr, id); + r->res_id = 0; + r->res_recover_locks_count = 0; + ls->ls_recover_list_count--; -static void recover_idr_clear(struct dlm_ls *ls) -{ - spin_lock(&ls->ls_recover_idr_lock); - idr_for_each(&ls->ls_recover_idr, recover_idr_clear_rsb, ls); - idr_remove_all(&ls->ls_recover_idr); + dlm_put_rsb(r); + } if (ls->ls_recover_list_count != 0) { log_error(ls, "warning: recover_list_count %d", diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 7ff49852b0cb..911649a47dd5 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -503,11 +503,11 @@ static ssize_t device_write(struct file *file, const char __user *buf, #endif return -EINVAL; -#ifdef CONFIG_COMPAT - if (count > sizeof(struct dlm_write_request32) + DLM_RESNAME_MAXLEN) -#else + /* + * can't compare against COMPAT/dlm_write_request32 because + * we don't yet know if is64bit is zero + */ if (count > sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN) -#endif return -EINVAL; kbuf = kzalloc(count + 1, GFP_NOFS); diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig index cc16562654de..434aa313f077 100644 --- a/fs/ecryptfs/Kconfig +++ b/fs/ecryptfs/Kconfig @@ -1,6 +1,6 @@ config ECRYPT_FS - tristate "eCrypt filesystem layer support (EXPERIMENTAL)" - depends on EXPERIMENTAL && KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n) + tristate "eCrypt filesystem layer support" + depends on KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n) select CRYPTO_ECB select CRYPTO_CBC select CRYPTO_MD5 @@ -12,3 +12,11 @@ config ECRYPT_FS To compile this file system support as a module, choose M here: the module will be called ecryptfs. + +config ECRYPT_FS_MESSAGING + bool "Enable notifications for userspace key wrap/unwrap" + depends on ECRYPT_FS + help + Enables the /dev/ecryptfs entry for use by ecryptfsd. This allows + for userspace to wrap/unwrap file encryption keys by other + backends, like OpenSSL. diff --git a/fs/ecryptfs/Makefile b/fs/ecryptfs/Makefile index 2cc9ee4ad2eb..49678a69947d 100644 --- a/fs/ecryptfs/Makefile +++ b/fs/ecryptfs/Makefile @@ -1,7 +1,10 @@ # -# Makefile for the Linux 2.6 eCryptfs +# Makefile for the Linux eCryptfs # obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o -ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o miscdev.o kthread.o debug.o +ecryptfs-y := dentry.o file.o inode.o main.o super.o mmap.o read_write.o \ + crypto.o keystore.o kthread.o debug.o + +ecryptfs-$(CONFIG_ECRYPT_FS_MESSAGING) += messaging.o miscdev.o diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index a7b0c2dfb3db..d5c25db4398f 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -301,17 +301,14 @@ int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, while (size > 0 && i < sg_size) { pg = virt_to_page(addr); offset = offset_in_page(addr); - if (sg) - sg_set_page(&sg[i], pg, 0, offset); + sg_set_page(&sg[i], pg, 0, offset); remainder_of_page = PAGE_CACHE_SIZE - offset; if (size >= remainder_of_page) { - if (sg) - sg[i].length = remainder_of_page; + sg[i].length = remainder_of_page; addr += remainder_of_page; size -= remainder_of_page; } else { - if (sg) - sg[i].length = size; + sg[i].length = size; addr += size; size = 0; } diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 1b5d9af937df..bf12ba5dd223 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -45,14 +45,12 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags) { struct dentry *lower_dentry; - struct vfsmount *lower_mnt; int rc = 1; if (flags & LOOKUP_RCU) return -ECHILD; lower_dentry = ecryptfs_dentry_to_lower(dentry); - lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate) goto out; rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags); diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index cfb4b9fed520..dd299b389d4e 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -172,6 +172,19 @@ ecryptfs_get_key_payload_data(struct key *key) #define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 24 #define ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN (18 + 1 + 4 + 1 + 32) +#ifdef CONFIG_ECRYPT_FS_MESSAGING +# define ECRYPTFS_VERSIONING_MASK_MESSAGING (ECRYPTFS_VERSIONING_DEVMISC \ + | ECRYPTFS_VERSIONING_PUBKEY) +#else +# define ECRYPTFS_VERSIONING_MASK_MESSAGING 0 +#endif + +#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ + | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \ + | ECRYPTFS_VERSIONING_XATTR \ + | ECRYPTFS_VERSIONING_MULTKEY \ + | ECRYPTFS_VERSIONING_MASK_MESSAGING \ + | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION) struct ecryptfs_key_sig { struct list_head crypt_stat_list; char keysig[ECRYPTFS_SIG_SIZE_HEX + 1]; @@ -399,7 +412,9 @@ struct ecryptfs_daemon { struct hlist_node euid_chain; }; +#ifdef CONFIG_ECRYPT_FS_MESSAGING extern struct mutex ecryptfs_daemon_hash_mux; +#endif static inline size_t ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat) @@ -509,6 +524,12 @@ ecryptfs_dentry_to_lower_mnt(struct dentry *dentry) return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.mnt; } +static inline struct path * +ecryptfs_dentry_to_lower_path(struct dentry *dentry) +{ + return &((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path; +} + static inline void ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt) { @@ -604,6 +625,7 @@ int ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode); +#ifdef CONFIG_ECRYPT_FS_MESSAGING int ecryptfs_process_response(struct ecryptfs_daemon *daemon, struct ecryptfs_message *msg, u32 seq); int ecryptfs_send_message(char *data, int data_len, @@ -612,6 +634,24 @@ int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx, struct ecryptfs_message **emsg); int ecryptfs_init_messaging(void); void ecryptfs_release_messaging(void); +#else +static inline int ecryptfs_init_messaging(void) +{ + return 0; +} +static inline void ecryptfs_release_messaging(void) +{ } +static inline int ecryptfs_send_message(char *data, int data_len, + struct ecryptfs_msg_ctx **msg_ctx) +{ + return -ENOTCONN; +} +static inline int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx, + struct ecryptfs_message **emsg) +{ + return -ENOMSG; +} +#endif void ecryptfs_write_header_metadata(char *virt, @@ -649,12 +689,11 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, size_t offset_in_page, size_t size, struct inode *ecryptfs_inode); struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index); -int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon); -int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon); int ecryptfs_parse_packet_length(unsigned char *data, size_t *size, size_t *length_size); int ecryptfs_write_packet_length(char *dest, size_t size, size_t *packet_size_length); +#ifdef CONFIG_ECRYPT_FS_MESSAGING int ecryptfs_init_ecryptfs_miscdev(void); void ecryptfs_destroy_ecryptfs_miscdev(void); int ecryptfs_send_miscdev(char *data, size_t data_size, @@ -663,6 +702,9 @@ int ecryptfs_send_miscdev(char *data, size_t data_size, void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx); int ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, struct file *file); +int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon); +int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon); +#endif int ecryptfs_init_kthread(void); void ecryptfs_destroy_kthread(void); int ecryptfs_privileged_open(struct file **lower_file, diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index d45ba4568128..63b1f54b6a1f 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -118,7 +118,7 @@ static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir) lower_file = ecryptfs_file_to_lower(file); lower_file->f_pos = file->f_pos; - inode = file->f_path.dentry->d_inode; + inode = file_inode(file); memset(&buf, 0, sizeof(buf)); buf.dirent = dirent; buf.dentry = file->f_path.dentry; @@ -133,7 +133,7 @@ static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir) goto out; if (rc >= 0) fsstack_copy_attr_atime(inode, - lower_file->f_path.dentry->d_inode); + file_inode(lower_file)); out: return rc; } @@ -199,7 +199,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file) struct dentry *ecryptfs_dentry = file->f_path.dentry; /* Private value of ecryptfs_dentry allocated in * ecryptfs_lookup() */ - struct dentry *lower_dentry; struct ecryptfs_file_info *file_info; mount_crypt_stat = &ecryptfs_superblock_to_private( @@ -222,7 +221,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file) rc = -ENOMEM; goto out; } - lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat; mutex_lock(&crypt_stat->cs_mutex); if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) { diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index cc7709e7c508..5eab400e2590 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -999,8 +999,8 @@ out: return rc; } -int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +static int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) { struct ecryptfs_mount_crypt_stat *mount_crypt_stat; int rc = 0; @@ -1021,14 +1021,13 @@ int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry, return rc; } -int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +static int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) { struct kstat lower_stat; int rc; - rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry), - ecryptfs_dentry_to_lower(dentry), &lower_stat); + rc = vfs_getattr(ecryptfs_dentry_to_lower_path(dentry), &lower_stat); if (!rc) { fsstack_copy_attr_all(dentry->d_inode, ecryptfs_inode_to_lower(dentry->d_inode)); diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 2333203a120b..7d52806c2119 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -1150,7 +1150,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, struct ecryptfs_message *msg = NULL; char *auth_tok_sig; char *payload; - size_t payload_len; + size_t payload_len = 0; int rc; rc = ecryptfs_get_auth_tok_sig(&auth_tok_sig, auth_tok); @@ -1168,7 +1168,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, rc = ecryptfs_send_message(payload, payload_len, &msg_ctx); if (rc) { ecryptfs_printk(KERN_ERR, "Error sending message to " - "ecryptfsd\n"); + "ecryptfsd: %d\n", rc); goto out; } rc = ecryptfs_wait_for_response(msg_ctx, &msg); @@ -1202,8 +1202,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, crypt_stat->key_size); } out: - if (msg) - kfree(msg); + kfree(msg); return rc; } @@ -1989,7 +1988,7 @@ pki_encrypt_session_key(struct key *auth_tok_key, rc = ecryptfs_send_message(payload, payload_len, &msg_ctx); if (rc) { ecryptfs_printk(KERN_ERR, "Error sending message to " - "ecryptfsd\n"); + "ecryptfsd: %d\n", rc); goto out; } rc = ecryptfs_wait_for_response(msg_ctx, &msg); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 4e0886c9e5c4..e924cf45aad9 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -629,6 +629,7 @@ static struct file_system_type ecryptfs_fs_type = { .kill_sb = ecryptfs_kill_block_super, .fs_flags = 0 }; +MODULE_ALIAS_FS("ecryptfs"); /** * inode_info_init_once diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 5fa2471796c2..49ff8ea08f1c 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -97,8 +97,7 @@ static void ecryptfs_msg_ctx_free_to_alloc(struct ecryptfs_msg_ctx *msg_ctx) void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx) { list_move(&(msg_ctx->node), &ecryptfs_msg_ctx_free_list); - if (msg_ctx->msg) - kfree(msg_ctx->msg); + kfree(msg_ctx->msg); msg_ctx->msg = NULL; msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_FREE; } @@ -115,10 +114,9 @@ void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx) */ int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon) { - struct hlist_node *elem; int rc; - hlist_for_each_entry(*daemon, elem, + hlist_for_each_entry(*daemon, &ecryptfs_daemon_hash[ecryptfs_current_euid_hash()], euid_chain) { if (uid_eq((*daemon)->file->f_cred->euid, current_euid())) { @@ -284,7 +282,7 @@ ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type, int rc; rc = ecryptfs_find_daemon_by_euid(&daemon); - if (rc || !daemon) { + if (rc) { rc = -ENOTCONN; goto out; } @@ -445,7 +443,6 @@ void ecryptfs_release_messaging(void) mutex_unlock(&ecryptfs_msg_ctx_lists_mux); } if (ecryptfs_daemon_hash) { - struct hlist_node *elem; struct ecryptfs_daemon *daemon; int i; @@ -453,7 +450,7 @@ void ecryptfs_release_messaging(void) for (i = 0; i < (1 << ecryptfs_hash_bits); i++) { int rc; - hlist_for_each_entry(daemon, elem, + hlist_for_each_entry(daemon, &ecryptfs_daemon_hash[i], euid_chain) { rc = ecryptfs_exorcise_daemon(daemon); diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index b2a34a192f4f..6a160539cd23 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -40,16 +40,12 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, loff_t offset, size_t size) { struct file *lower_file; - mm_segment_t fs_save; ssize_t rc; lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file; if (!lower_file) return -EIO; - fs_save = get_fs(); - set_fs(get_ds()); - rc = vfs_write(lower_file, data, size, &offset); - set_fs(fs_save); + rc = kernel_write(lower_file, data, size, offset); mark_inode_dirty_sync(ecryptfs_inode); return rc; } diff --git a/fs/efivarfs/Kconfig b/fs/efivarfs/Kconfig new file mode 100644 index 000000000000..367bbb10c543 --- /dev/null +++ b/fs/efivarfs/Kconfig @@ -0,0 +1,12 @@ +config EFIVAR_FS + tristate "EFI Variable filesystem" + depends on EFI + help + efivarfs is a replacement filesystem for the old EFI + variable support via sysfs, as it doesn't suffer from the + same 1024-byte variable size limit. + + To compile this file system support as a module, choose M + here. The module will be called efivarfs. + + If unsure, say N. diff --git a/fs/efivarfs/Makefile b/fs/efivarfs/Makefile new file mode 100644 index 000000000000..955d478177d5 --- /dev/null +++ b/fs/efivarfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the efivarfs filesystem +# + +obj-$(CONFIG_EFIVAR_FS) += efivarfs.o + +efivarfs-objs := inode.o file.o super.o diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c new file mode 100644 index 000000000000..ede07fc7309f --- /dev/null +++ b/fs/efivarfs/file.c @@ -0,0 +1,105 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/efi.h> +#include <linux/fs.h> + +#include "internal.h" + +static ssize_t efivarfs_file_write(struct file *file, + const char __user *userbuf, size_t count, loff_t *ppos) +{ + struct efivar_entry *var = file->private_data; + void *data; + u32 attributes; + struct inode *inode = file->f_mapping->host; + unsigned long datasize = count - sizeof(attributes); + ssize_t bytes = 0; + bool set = false; + + if (count < sizeof(attributes)) + return -EINVAL; + + if (copy_from_user(&attributes, userbuf, sizeof(attributes))) + return -EFAULT; + + if (attributes & ~(EFI_VARIABLE_MASK)) + return -EINVAL; + + data = kmalloc(datasize, GFP_KERNEL); + if (!data) + return -ENOMEM; + + if (copy_from_user(data, userbuf + sizeof(attributes), datasize)) { + bytes = -EFAULT; + goto out; + } + + bytes = efivar_entry_set_get_size(var, attributes, &datasize, + data, &set); + if (!set && bytes) + goto out; + + if (bytes == -ENOENT) { + drop_nlink(inode); + d_delete(file->f_dentry); + dput(file->f_dentry); + } else { + mutex_lock(&inode->i_mutex); + i_size_write(inode, datasize + sizeof(attributes)); + mutex_unlock(&inode->i_mutex); + } + + bytes = count; + +out: + kfree(data); + + return bytes; +} + +static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct efivar_entry *var = file->private_data; + unsigned long datasize = 0; + u32 attributes; + void *data; + ssize_t size = 0; + int err; + + err = efivar_entry_size(var, &datasize); + if (err) + return err; + + data = kmalloc(datasize + sizeof(attributes), GFP_KERNEL); + + if (!data) + return -ENOMEM; + + size = efivar_entry_get(var, &attributes, &datasize, + data + sizeof(attributes)); + if (size) + goto out_free; + + memcpy(data, &attributes, sizeof(attributes)); + size = simple_read_from_buffer(userbuf, count, ppos, + data, datasize + sizeof(attributes)); +out_free: + kfree(data); + + return size; +} + +const struct file_operations efivarfs_file_operations = { + .open = simple_open, + .read = efivarfs_file_read, + .write = efivarfs_file_write, + .llseek = no_llseek, +}; diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c new file mode 100644 index 000000000000..640e289d522e --- /dev/null +++ b/fs/efivarfs/inode.c @@ -0,0 +1,173 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/efi.h> +#include <linux/fs.h> +#include <linux/ctype.h> + +#include "internal.h" + +struct inode *efivarfs_get_inode(struct super_block *sb, + const struct inode *dir, int mode, dev_t dev) +{ + struct inode *inode = new_inode(sb); + + if (inode) { + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + switch (mode & S_IFMT) { + case S_IFREG: + inode->i_fop = &efivarfs_file_operations; + break; + case S_IFDIR: + inode->i_op = &efivarfs_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + inc_nlink(inode); + break; + } + } + return inode; +} + +/* + * Return true if 'str' is a valid efivarfs filename of the form, + * + * VariableName-12345678-1234-1234-1234-1234567891bc + */ +bool efivarfs_valid_name(const char *str, int len) +{ + static const char dashes[EFI_VARIABLE_GUID_LEN] = { + [8] = 1, [13] = 1, [18] = 1, [23] = 1 + }; + const char *s = str + len - EFI_VARIABLE_GUID_LEN; + int i; + + /* + * We need a GUID, plus at least one letter for the variable name, + * plus the '-' separator + */ + if (len < EFI_VARIABLE_GUID_LEN + 2) + return false; + + /* GUID must be preceded by a '-' */ + if (*(s - 1) != '-') + return false; + + /* + * Validate that 's' is of the correct format, e.g. + * + * 12345678-1234-1234-1234-123456789abc + */ + for (i = 0; i < EFI_VARIABLE_GUID_LEN; i++) { + if (dashes[i]) { + if (*s++ != '-') + return false; + } else { + if (!isxdigit(*s++)) + return false; + } + } + + return true; +} + +static void efivarfs_hex_to_guid(const char *str, efi_guid_t *guid) +{ + guid->b[0] = hex_to_bin(str[6]) << 4 | hex_to_bin(str[7]); + guid->b[1] = hex_to_bin(str[4]) << 4 | hex_to_bin(str[5]); + guid->b[2] = hex_to_bin(str[2]) << 4 | hex_to_bin(str[3]); + guid->b[3] = hex_to_bin(str[0]) << 4 | hex_to_bin(str[1]); + guid->b[4] = hex_to_bin(str[11]) << 4 | hex_to_bin(str[12]); + guid->b[5] = hex_to_bin(str[9]) << 4 | hex_to_bin(str[10]); + guid->b[6] = hex_to_bin(str[16]) << 4 | hex_to_bin(str[17]); + guid->b[7] = hex_to_bin(str[14]) << 4 | hex_to_bin(str[15]); + guid->b[8] = hex_to_bin(str[19]) << 4 | hex_to_bin(str[20]); + guid->b[9] = hex_to_bin(str[21]) << 4 | hex_to_bin(str[22]); + guid->b[10] = hex_to_bin(str[24]) << 4 | hex_to_bin(str[25]); + guid->b[11] = hex_to_bin(str[26]) << 4 | hex_to_bin(str[27]); + guid->b[12] = hex_to_bin(str[28]) << 4 | hex_to_bin(str[29]); + guid->b[13] = hex_to_bin(str[30]) << 4 | hex_to_bin(str[31]); + guid->b[14] = hex_to_bin(str[32]) << 4 | hex_to_bin(str[33]); + guid->b[15] = hex_to_bin(str[34]) << 4 | hex_to_bin(str[35]); +} + +static int efivarfs_create(struct inode *dir, struct dentry *dentry, + umode_t mode, bool excl) +{ + struct inode *inode; + struct efivar_entry *var; + int namelen, i = 0, err = 0; + + if (!efivarfs_valid_name(dentry->d_name.name, dentry->d_name.len)) + return -EINVAL; + + inode = efivarfs_get_inode(dir->i_sb, dir, mode, 0); + if (!inode) + return -ENOMEM; + + var = kzalloc(sizeof(struct efivar_entry), GFP_KERNEL); + if (!var) { + err = -ENOMEM; + goto out; + } + + /* length of the variable name itself: remove GUID and separator */ + namelen = dentry->d_name.len - EFI_VARIABLE_GUID_LEN - 1; + + efivarfs_hex_to_guid(dentry->d_name.name + namelen + 1, + &var->var.VendorGuid); + + for (i = 0; i < namelen; i++) + var->var.VariableName[i] = dentry->d_name.name[i]; + + var->var.VariableName[i] = '\0'; + + inode->i_private = var; + + efivar_entry_add(var, &efivarfs_list); + d_instantiate(dentry, inode); + dget(dentry); +out: + if (err) { + kfree(var); + iput(inode); + } + return err; +} + +static int efivarfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct efivar_entry *var = dentry->d_inode->i_private; + + if (efivar_entry_delete(var)) + return -EINVAL; + + drop_nlink(dentry->d_inode); + dput(dentry); + return 0; +}; + +/* + * Handle negative dentry. + */ +static struct dentry *efivarfs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + d_add(dentry, NULL); + return NULL; +} + +const struct inode_operations efivarfs_dir_inode_operations = { + .lookup = efivarfs_lookup, + .unlink = efivarfs_unlink, + .create = efivarfs_create, +}; diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h new file mode 100644 index 000000000000..b5ff16addb7c --- /dev/null +++ b/fs/efivarfs/internal.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef EFIVAR_FS_INTERNAL_H +#define EFIVAR_FS_INTERNAL_H + +#include <linux/list.h> + +extern const struct file_operations efivarfs_file_operations; +extern const struct inode_operations efivarfs_dir_inode_operations; +extern bool efivarfs_valid_name(const char *str, int len); +extern struct inode *efivarfs_get_inode(struct super_block *sb, + const struct inode *dir, int mode, dev_t dev); + +extern struct list_head efivarfs_list; + +#endif /* EFIVAR_FS_INTERNAL_H */ diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c new file mode 100644 index 000000000000..34c48f1fcdbc --- /dev/null +++ b/fs/efivarfs/super.c @@ -0,0 +1,267 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/ctype.h> +#include <linux/efi.h> +#include <linux/fs.h> +#include <linux/module.h> +#include <linux/pagemap.h> + +#include "internal.h" + +LIST_HEAD(efivarfs_list); + +static void efivarfs_evict_inode(struct inode *inode) +{ + clear_inode(inode); +} + +static const struct super_operations efivarfs_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .evict_inode = efivarfs_evict_inode, + .show_options = generic_show_options, +}; + +static struct super_block *efivarfs_sb; + +/* + * Compare two efivarfs file names. + * + * An efivarfs filename is composed of two parts, + * + * 1. A case-sensitive variable name + * 2. A case-insensitive GUID + * + * So we need to perform a case-sensitive match on part 1 and a + * case-insensitive match on part 2. + */ +static int efivarfs_d_compare(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, + const struct qstr *name) +{ + int guid = len - EFI_VARIABLE_GUID_LEN; + + if (name->len != len) + return 1; + + /* Case-sensitive compare for the variable name */ + if (memcmp(str, name->name, guid)) + return 1; + + /* Case-insensitive compare for the GUID */ + return strncasecmp(name->name + guid, str + guid, EFI_VARIABLE_GUID_LEN); +} + +static int efivarfs_d_hash(const struct dentry *dentry, + const struct inode *inode, struct qstr *qstr) +{ + unsigned long hash = init_name_hash(); + const unsigned char *s = qstr->name; + unsigned int len = qstr->len; + + if (!efivarfs_valid_name(s, len)) + return -EINVAL; + + while (len-- > EFI_VARIABLE_GUID_LEN) + hash = partial_name_hash(*s++, hash); + + /* GUID is case-insensitive. */ + while (len--) + hash = partial_name_hash(tolower(*s++), hash); + + qstr->hash = end_name_hash(hash); + return 0; +} + +/* + * Retaining negative dentries for an in-memory filesystem just wastes + * memory and lookup time: arrange for them to be deleted immediately. + */ +static int efivarfs_delete_dentry(const struct dentry *dentry) +{ + return 1; +} + +static struct dentry_operations efivarfs_d_ops = { + .d_compare = efivarfs_d_compare, + .d_hash = efivarfs_d_hash, + .d_delete = efivarfs_delete_dentry, +}; + +static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name) +{ + struct dentry *d; + struct qstr q; + int err; + + q.name = name; + q.len = strlen(name); + + err = efivarfs_d_hash(NULL, NULL, &q); + if (err) + return ERR_PTR(err); + + d = d_alloc(parent, &q); + if (d) + return d; + + return ERR_PTR(-ENOMEM); +} + +static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, + unsigned long name_size, void *data) +{ + struct super_block *sb = (struct super_block *)data; + struct efivar_entry *entry; + struct inode *inode = NULL; + struct dentry *dentry, *root = sb->s_root; + unsigned long size = 0; + char *name; + int len, i; + int err = -ENOMEM; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return err; + + memcpy(entry->var.VariableName, name16, name_size); + memcpy(&(entry->var.VendorGuid), &vendor, sizeof(efi_guid_t)); + + len = utf16_strlen(entry->var.VariableName); + + /* name, plus '-', plus GUID, plus NUL*/ + name = kmalloc(len + 1 + EFI_VARIABLE_GUID_LEN + 1, GFP_KERNEL); + if (!name) + goto fail; + + for (i = 0; i < len; i++) + name[i] = entry->var.VariableName[i] & 0xFF; + + name[len] = '-'; + + efi_guid_unparse(&entry->var.VendorGuid, name + len + 1); + + name[len + EFI_VARIABLE_GUID_LEN+1] = '\0'; + + inode = efivarfs_get_inode(sb, root->d_inode, S_IFREG | 0644, 0); + if (!inode) + goto fail_name; + + dentry = efivarfs_alloc_dentry(root, name); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto fail_inode; + } + + /* copied by the above to local storage in the dentry. */ + kfree(name); + + efivar_entry_size(entry, &size); + efivar_entry_add(entry, &efivarfs_list); + + mutex_lock(&inode->i_mutex); + inode->i_private = entry; + i_size_write(inode, size + sizeof(entry->var.Attributes)); + mutex_unlock(&inode->i_mutex); + d_add(dentry, inode); + + return 0; + +fail_inode: + iput(inode); +fail_name: + kfree(name); +fail: + kfree(entry); + return err; +} + +static int efivarfs_destroy(struct efivar_entry *entry, void *data) +{ + efivar_entry_remove(entry); + kfree(entry); + return 0; +} + +static int efivarfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *inode = NULL; + struct dentry *root; + int err; + + efivarfs_sb = sb; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = EFIVARFS_MAGIC; + sb->s_op = &efivarfs_ops; + sb->s_d_op = &efivarfs_d_ops; + sb->s_time_gran = 1; + + inode = efivarfs_get_inode(sb, NULL, S_IFDIR | 0755, 0); + if (!inode) + return -ENOMEM; + inode->i_op = &efivarfs_dir_inode_operations; + + root = d_make_root(inode); + sb->s_root = root; + if (!root) + return -ENOMEM; + + INIT_LIST_HEAD(&efivarfs_list); + + err = efivar_init(efivarfs_callback, (void *)sb, false, + true, &efivarfs_list); + if (err) + __efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL); + + return err; +} + +static struct dentry *efivarfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_single(fs_type, flags, data, efivarfs_fill_super); +} + +static void efivarfs_kill_sb(struct super_block *sb) +{ + kill_litter_super(sb); + efivarfs_sb = NULL; + + /* Remove all entries and destroy */ + __efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL); +} + +static struct file_system_type efivarfs_type = { + .name = "efivarfs", + .mount = efivarfs_mount, + .kill_sb = efivarfs_kill_sb, +}; + +static __init int efivarfs_init(void) +{ + if (!efi_enabled(EFI_RUNTIME_SERVICES)) + return 0; + + if (!efivars_kobject()) + return 0; + + return register_filesystem(&efivarfs_type); +} + +MODULE_AUTHOR("Matthew Garrett, Jeremy Kerr"); +MODULE_DESCRIPTION("EFI Variable Filesystem"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_FS("efivarfs"); + +module_init(efivarfs_init); diff --git a/fs/efs/Kconfig b/fs/efs/Kconfig index 6ebfc1c207a8..d020e3c30fea 100644 --- a/fs/efs/Kconfig +++ b/fs/efs/Kconfig @@ -1,6 +1,6 @@ config EFS_FS - tristate "EFS file system support (read only) (EXPERIMENTAL)" - depends on BLOCK && EXPERIMENTAL + tristate "EFS file system support (read only)" + depends on BLOCK help EFS is an older file system used for non-ISO9660 CD-ROMs and hard disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer diff --git a/fs/efs/dir.c b/fs/efs/dir.c index 7ee6f7e3a608..055a9e9ca747 100644 --- a/fs/efs/dir.c +++ b/fs/efs/dir.c @@ -20,7 +20,7 @@ const struct inode_operations efs_dir_inode_operations = { }; static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct buffer_head *bh; struct efs_dir *dirblock; diff --git a/fs/efs/super.c b/fs/efs/super.c index 2002431ef9a0..c6f57a74a559 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -33,6 +33,7 @@ static struct file_system_type efs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("efs"); static struct pt_types sgi_pt_types[] = { {0x00, "SGI vh"}, diff --git a/fs/exec.c b/fs/exec.c index 20df02c1cc70..a96a4885bbbf 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -123,7 +123,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) goto out; error = -EINVAL; - if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) + if (!S_ISREG(file_inode(file)->i_mode)) goto exit; error = -EACCES; @@ -355,7 +355,7 @@ static bool valid_arg_len(struct linux_binprm *bprm, long len) * flags, permissions, and offset, so we use temporary values. We'll update * them later in setup_arg_pages(). */ -int bprm_mm_init(struct linux_binprm *bprm) +static int bprm_mm_init(struct linux_binprm *bprm) { int err; struct mm_struct *mm = NULL; @@ -764,7 +764,7 @@ struct file *open_exec(const char *name) goto out; err = -EACCES; - if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) + if (!S_ISREG(file_inode(file)->i_mode)) goto exit; if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) @@ -1098,7 +1098,7 @@ EXPORT_SYMBOL(flush_old_exec); void would_dump(struct linux_binprm *bprm, struct file *file) { - if (inode_permission(file->f_path.dentry->d_inode, MAY_READ) < 0) + if (inode_permission(file_inode(file), MAY_READ) < 0) bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; } EXPORT_SYMBOL(would_dump); @@ -1111,7 +1111,7 @@ void setup_new_exec(struct linux_binprm * bprm) current->sas_ss_sp = current->sas_ss_size = 0; if (uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid())) - set_dumpable(current->mm, SUID_DUMPABLE_ENABLED); + set_dumpable(current->mm, SUID_DUMP_USER); else set_dumpable(current->mm, suid_dumpable); @@ -1270,7 +1270,7 @@ static int check_unsafe_exec(struct linux_binprm *bprm) int prepare_binprm(struct linux_binprm *bprm) { umode_t mode; - struct inode * inode = bprm->file->f_path.dentry->d_inode; + struct inode * inode = file_inode(bprm->file); int retval; mode = inode->i_mode; @@ -1639,17 +1639,17 @@ EXPORT_SYMBOL(set_binfmt); void set_dumpable(struct mm_struct *mm, int value) { switch (value) { - case SUID_DUMPABLE_DISABLED: + case SUID_DUMP_DISABLE: clear_bit(MMF_DUMPABLE, &mm->flags); smp_wmb(); clear_bit(MMF_DUMP_SECURELY, &mm->flags); break; - case SUID_DUMPABLE_ENABLED: + case SUID_DUMP_USER: set_bit(MMF_DUMPABLE, &mm->flags); smp_wmb(); clear_bit(MMF_DUMP_SECURELY, &mm->flags); break; - case SUID_DUMPABLE_SAFE: + case SUID_DUMP_ROOT: set_bit(MMF_DUMP_SECURELY, &mm->flags); smp_wmb(); set_bit(MMF_DUMPABLE, &mm->flags); @@ -1662,7 +1662,7 @@ int __get_dumpable(unsigned long mm_flags) int ret; ret = mm_flags & MMF_DUMPABLE_MASK; - return (ret > SUID_DUMPABLE_ENABLED) ? SUID_DUMPABLE_SAFE : ret; + return (ret > SUID_DUMP_USER) ? SUID_DUMP_ROOT : ret; } int get_dumpable(struct mm_struct *mm) diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index c61e62ac231c..46375896cfc0 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -242,7 +242,7 @@ static int exofs_readdir(struct file *filp, void *dirent, filldir_t filldir) { loff_t pos = filp->f_pos; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); unsigned int offset = pos & ~PAGE_CACHE_MASK; unsigned long n = pos >> PAGE_CACHE_SHIFT; unsigned long npages = dir_pages(inode); diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 5e59280d42d7..9d9763328734 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -1010,6 +1010,7 @@ static struct file_system_type exofs_type = { .mount = exofs_mount, .kill_sb = generic_shutdown_super, }; +MODULE_ALIAS_FS("exofs"); static int __init init_exofs(void) { diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 5df4bb4aab14..262fc9940982 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -44,14 +44,13 @@ find_acceptable_alias(struct dentry *result, { struct dentry *dentry, *toput = NULL; struct inode *inode; - struct hlist_node *p; if (acceptable(context, result)) return result; inode = result->d_inode; spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) { + hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { dget(dentry); spin_unlock(&inode->i_lock); if (toput) diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 2616d0ea5c5c..9f9992b37924 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -159,15 +159,6 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group) return bh; } -static void release_blocks(struct super_block *sb, int count) -{ - if (count) { - struct ext2_sb_info *sbi = EXT2_SB(sb); - - percpu_counter_add(&sbi->s_freeblocks_counter, count); - } -} - static void group_adjust_blocks(struct super_block *sb, int group_no, struct ext2_group_desc *desc, struct buffer_head *bh, int count) { @@ -568,8 +559,11 @@ do_more: } error_return: brelse(bitmap_bh); - release_blocks(sb, freed); - dquot_free_block_nodirty(inode, freed); + if (freed) { + percpu_counter_add(&sbi->s_freeblocks_counter, freed); + dquot_free_block_nodirty(inode, freed); + mark_inode_dirty(inode); + } } /** @@ -1239,10 +1233,6 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal, *errp = -ENOSPC; sb = inode->i_sb; - if (!sb) { - printk("ext2_new_blocks: nonexistent device"); - return 0; - } /* * Check quota for allocation of this block. @@ -1416,9 +1406,11 @@ allocated: *errp = 0; brelse(bitmap_bh); - dquot_free_block_nodirty(inode, *count-num); - mark_inode_dirty(inode); - *count = num; + if (num < *count) { + dquot_free_block_nodirty(inode, *count-num); + mark_inode_dirty(inode); + *count = num; + } return ret_block; io_error: diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 0f4f5c929257..4237722bfd27 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -290,7 +290,7 @@ static int ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) { loff_t pos = filp->f_pos; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; unsigned int offset = pos & ~PAGE_CACHE_MASK; unsigned long n = pos >> PAGE_CACHE_SHIFT; diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 8f370e012e61..7cadd823bb31 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -118,7 +118,6 @@ void ext2_free_inode (struct inode * inode) * as writing the quota to disk may need the lock as well. */ /* Quota is already initialized in iput() */ - ext2_xattr_delete_inode(inode); dquot_free_inode(inode); dquot_drop(inode); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 6363ac66fafa..fe60cc1117d8 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -34,6 +34,7 @@ #include "ext2.h" #include "acl.h" #include "xip.h" +#include "xattr.h" static int __ext2_write_inode(struct inode *inode, int do_sync); @@ -88,6 +89,7 @@ void ext2_evict_inode(struct inode * inode) inode->i_size = 0; if (inode->i_blocks) ext2_truncate_blocks(inode, 0); + ext2_xattr_delete_inode(inode); } invalidate_inode_buffers(inode); @@ -495,6 +497,10 @@ static int ext2_alloc_branch(struct inode *inode, * parent to disk. */ bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + if (unlikely(!bh)) { + err = -ENOMEM; + goto failed; + } branch[n].bh = bh; lock_buffer(bh); memset(bh->b_data, 0, blocksize); @@ -523,6 +529,14 @@ static int ext2_alloc_branch(struct inode *inode, } *blks = num; return err; + +failed: + for (i = 1; i < n; i++) + bforget(branch[i].bh); + for (i = 0; i < indirect_blks; i++) + ext2_free_blocks(inode, new_blocks[i], 1); + ext2_free_blocks(inode, new_blocks[i], num); + return err; } /** diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 2de655f5d625..5d46c09863f0 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -19,7 +19,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = file_inode(filp); struct ext2_inode_info *ei = EXT2_I(inode); unsigned int flags; unsigned short rsv_window_size; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index fa04d023177e..288534920fe5 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1500,7 +1500,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type, bh = sb_bread(sb, tmp_bh.b_blocknr); else bh = sb_getblk(sb, tmp_bh.b_blocknr); - if (!bh) { + if (unlikely(!bh)) { err = -EIO; goto out; } @@ -1536,6 +1536,7 @@ static struct file_system_type ext2_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("ext2"); static int __init init_ext2_fs(void) { diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index b6754dbbce3c..2d7557db3ae8 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -662,10 +662,10 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, ea_idebug(inode, "creating block %d", block); new_bh = sb_getblk(sb, block); - if (!new_bh) { + if (unlikely(!new_bh)) { ext2_free_blocks(inode, block, 1); mark_inode_dirty(inode); - error = -EIO; + error = -ENOMEM; goto cleanup; } lock_buffer(new_bh); diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index dd91264ba94f..87eccbbca255 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -99,7 +99,7 @@ static int ext3_readdir(struct file * filp, int i, stored; struct ext3_dir_entry_2 *de; int err; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; int ret = 0; int dir_has_error = 0; @@ -114,7 +114,7 @@ static int ext3_readdir(struct file * filp, * We don't set the inode dirty flag since it's not * critical that it get flushed back to the disk. */ - EXT3_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL; + EXT3_I(file_inode(filp))->i_flags &= ~EXT3_INDEX_FL; } stored = 0; offset = filp->f_pos & (sb->s_blocksize - 1); @@ -457,7 +457,7 @@ static int call_filldir(struct file * filp, void * dirent, { struct dir_private_info *info = filp->private_data; loff_t curr_pos; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct super_block * sb; int error; @@ -487,7 +487,7 @@ static int ext3_dx_readdir(struct file * filp, void * dirent, filldir_t filldir) { struct dir_private_info *info = filp->private_data; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct fname *fname; int ret; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index b176d4253544..d512c4bc4ad7 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -676,6 +676,10 @@ static int ext3_alloc_branch(handle_t *handle, struct inode *inode, * parent to disk. */ bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + if (unlikely(!bh)) { + err = -ENOMEM; + goto failed; + } branch[n].bh = bh; lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); @@ -717,7 +721,7 @@ failed: BUFFER_TRACE(branch[i].bh, "call journal_forget"); ext3_journal_forget(handle, branch[i].bh); } - for (i = 0; i <indirect_blks; i++) + for (i = 0; i < indirect_blks; i++) ext3_free_blocks(handle, inode, new_blocks[i], 1); ext3_free_blocks(handle, inode, new_blocks[i], num); @@ -1078,8 +1082,8 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode, if (!err && buffer_mapped(&dummy)) { struct buffer_head *bh; bh = sb_getblk(inode->i_sb, dummy.b_blocknr); - if (!bh) { - *errp = -EIO; + if (unlikely(!bh)) { + *errp = -ENOMEM; goto err; } if (buffer_new(&dummy)) { @@ -2729,12 +2733,12 @@ static int __ext3_get_inode_loc(struct inode *inode, return -EIO; bh = sb_getblk(inode->i_sb, block); - if (!bh) { + if (unlikely(!bh)) { ext3_error (inode->i_sb, "ext3_get_inode_loc", "unable to read inode block - " "inode=%lu, block="E3FSBLK, inode->i_ino, block); - return -EIO; + return -ENOMEM; } if (!buffer_uptodate(bh)) { lock_buffer(bh); @@ -2783,7 +2787,7 @@ static int __ext3_get_inode_loc(struct inode *inode, bitmap_bh = sb_getblk(inode->i_sb, le32_to_cpu(desc->bg_inode_bitmap)); - if (!bitmap_bh) + if (unlikely(!bitmap_bh)) goto make_io; /* diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index 677a5c27dc69..4d96e9a64532 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -14,7 +14,7 @@ long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = file_inode(filp); struct ext3_inode_info *ei = EXT3_I(inode); unsigned int flags; unsigned short rsv_window_size; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 890b8947c546..692de13e3596 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -36,7 +36,6 @@ #define NAMEI_RA_CHUNKS 2 #define NAMEI_RA_BLOCKS 4 #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) -#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) static struct buffer_head *ext3_append(handle_t *handle, struct inode *inode, @@ -624,7 +623,7 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, start_minor_hash)); - dir = dir_file->f_path.dentry->d_inode; + dir = file_inode(dir_file); if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) { hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; if (hinfo.hash_version <= DX_HASH_TEA) @@ -638,7 +637,7 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, } hinfo.hash = start_hash; hinfo.minor_hash = 0; - frame = dx_probe(NULL, dir_file->f_path.dentry->d_inode, &hinfo, frames, &err); + frame = dx_probe(NULL, file_inode(dir_file), &hinfo, frames, &err); if (!frame) return err; diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 0f814f3450de..27105655502c 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -116,8 +116,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, int err; bh = sb_getblk(sb, blk); - if (!bh) - return ERR_PTR(-EIO); + if (unlikely(!bh)) + return ERR_PTR(-ENOMEM); if ((err = ext3_journal_get_write_access(handle, bh))) { brelse(bh); bh = ERR_PTR(err); @@ -234,8 +234,8 @@ static int setup_new_group_blocks(struct super_block *sb, goto exit_bh; gdb = sb_getblk(sb, block); - if (!gdb) { - err = -EIO; + if (unlikely(!gdb)) { + err = -ENOMEM; goto exit_bh; } if ((err = ext3_journal_get_write_access(handle, gdb))) { @@ -722,8 +722,8 @@ static void update_backups(struct super_block *sb, break; bh = sb_getblk(sb, group * bpg + blk_off); - if (!bh) { - err = -EIO; + if (unlikely(!bh)) { + err = -ENOMEM; break; } ext3_debug("update metadata backup %#04lx\n", diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 6e50223b3299..fb5120a5505c 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -353,7 +353,7 @@ static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb) return bdev; fail: - ext3_msg(sb, "error: failed to open journal device %s: %ld", + ext3_msg(sb, KERN_ERR, "error: failed to open journal device %s: %ld", __bdevname(dev, b), PTR_ERR(bdev)); return NULL; @@ -887,7 +887,7 @@ static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb) /*todo: use simple_strtoll with >32bit ext3 */ sb_block = simple_strtoul(options, &options, 0); if (*options && *options != ',') { - ext3_msg(sb, "error: invalid sb specification: %s", + ext3_msg(sb, KERN_ERR, "error: invalid sb specification: %s", (char *) *data); return 1; } @@ -916,21 +916,24 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) "Not enough memory for storing quotafile name"); return 0; } - if (sbi->s_qf_names[qtype] && - strcmp(sbi->s_qf_names[qtype], qname)) { - ext3_msg(sb, KERN_ERR, - "%s quota file already specified", QTYPE2NAME(qtype)); + if (sbi->s_qf_names[qtype]) { + int same = !strcmp(sbi->s_qf_names[qtype], qname); + kfree(qname); - return 0; + if (!same) { + ext3_msg(sb, KERN_ERR, + "%s quota file already specified", + QTYPE2NAME(qtype)); + } + return same; } - sbi->s_qf_names[qtype] = qname; - if (strchr(sbi->s_qf_names[qtype], '/')) { + if (strchr(qname, '/')) { ext3_msg(sb, KERN_ERR, "quotafile must be on filesystem root"); - kfree(sbi->s_qf_names[qtype]); - sbi->s_qf_names[qtype] = NULL; + kfree(qname); return 0; } + sbi->s_qf_names[qtype] = qname; set_opt(sbi->s_mount_opt, QUOTA); return 1; } @@ -945,11 +948,10 @@ static int clear_qf_name(struct super_block *sb, int qtype) { " when quota turned on"); return 0; } - /* - * The space will be released later when all options are confirmed - * to be correct - */ - sbi->s_qf_names[qtype] = NULL; + if (sbi->s_qf_names[qtype]) { + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + } return 1; } #endif @@ -2065,6 +2067,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": "writeback"); + sb->s_flags |= MS_SNAP_STABLE; return 0; @@ -2605,7 +2608,18 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) #ifdef CONFIG_QUOTA old_opts.s_jquota_fmt = sbi->s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) - old_opts.s_qf_names[i] = sbi->s_qf_names[i]; + if (sbi->s_qf_names[i]) { + old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], + GFP_KERNEL); + if (!old_opts.s_qf_names[i]) { + int j; + + for (j = 0; j < i; j++) + kfree(old_opts.s_qf_names[j]); + return -ENOMEM; + } + } else + old_opts.s_qf_names[i] = NULL; #endif /* @@ -2698,9 +2712,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) #ifdef CONFIG_QUOTA /* Release old quota file names */ for (i = 0; i < MAXQUOTAS; i++) - if (old_opts.s_qf_names[i] && - old_opts.s_qf_names[i] != sbi->s_qf_names[i]) - kfree(old_opts.s_qf_names[i]); + kfree(old_opts.s_qf_names[i]); #endif if (enable_quota) dquot_resume(sb, -1); @@ -2714,9 +2726,7 @@ restore_opts: #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { - if (sbi->s_qf_names[i] && - old_opts.s_qf_names[i] != sbi->s_qf_names[i]) - kfree(sbi->s_qf_names[i]); + kfree(sbi->s_qf_names[i]); sbi->s_qf_names[i] = old_opts.s_qf_names[i]; } #endif @@ -3058,6 +3068,7 @@ static struct file_system_type ext3_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("ext3"); static int __init init_ext3_fs(void) { diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index d22ebb7a4f55..b1fc96383e08 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -813,10 +813,10 @@ inserted: ea_idebug(inode, "creating block %d", block); new_bh = sb_getblk(sb, block); - if (!new_bh) { + if (unlikely(!new_bh)) { getblk_failed: ext3_free_blocks(handle, inode, block, 1); - error = -EIO; + error = -ENOMEM; goto cleanup; } lock_buffer(new_bh); diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index e6e0d988439b..39a54a0e9fe4 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -324,8 +324,8 @@ ext4_acl_chmod(struct inode *inode) if (error) return error; retry: - handle = ext4_journal_start(inode, - EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); + handle = ext4_journal_start(inode, EXT4_HT_XATTR, + ext4_jbd2_credits_xattr(inode)); if (IS_ERR(handle)) { error = PTR_ERR(handle); ext4_std_error(inode->i_sb, error); @@ -422,7 +422,8 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, acl = NULL; retry: - handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); + handle = ext4_journal_start(inode, EXT4_HT_XATTR, + ext4_jbd2_credits_xattr(inode)); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto release_and_out; diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index cf1821784a16..92e68b33fffd 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -358,7 +358,7 @@ void ext4_validate_block_bitmap(struct super_block *sb, } /** - * ext4_read_block_bitmap() + * ext4_read_block_bitmap_nowait() * @sb: super block * @block_group: given block group * @@ -457,6 +457,8 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) struct buffer_head *bh; bh = ext4_read_block_bitmap_nowait(sb, block_group); + if (!bh) + return NULL; if (ext4_wait_block_bitmap(sb, block_group, bh)) { put_bh(bh); return NULL; @@ -482,11 +484,16 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi, free_clusters = percpu_counter_read_positive(fcc); dirty_clusters = percpu_counter_read_positive(dcc); - root_clusters = EXT4_B2C(sbi, ext4_r_blocks_count(sbi->s_es)); + + /* + * r_blocks_count should always be multiple of the cluster ratio so + * we are safe to do a plane bit shift only. + */ + root_clusters = ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits; if (free_clusters - (nclusters + root_clusters + dirty_clusters) < EXT4_FREECLUSTERS_WATERMARK) { - free_clusters = EXT4_C2B(sbi, percpu_counter_sum_positive(fcc)); + free_clusters = percpu_counter_sum_positive(fcc); dirty_clusters = percpu_counter_sum_positive(dcc); } /* Check whether we have space after accounting for current @@ -628,7 +635,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) brelse(bitmap_bh); printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" ", computed = %llu, %llu\n", - EXT4_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)), + EXT4_NUM_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)), desc_count, bitmap_count); return bitmap_count; #else diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 80a28b297279..d8cd1f0f4661 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -110,7 +110,7 @@ static int ext4_readdir(struct file *filp, int i, stored; struct ext4_dir_entry_2 *de; int err; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; int ret = 0; int dir_has_error = 0; @@ -133,7 +133,7 @@ static int ext4_readdir(struct file *filp, * We don't set the inode dirty flag since it's not * critical that it get flushed back to the disk. */ - ext4_clear_inode_flag(filp->f_path.dentry->d_inode, + ext4_clear_inode_flag(file_inode(filp), EXT4_INODE_INDEX); } stored = 0; @@ -185,6 +185,7 @@ static int ext4_readdir(struct file *filp, "at offset %llu", (unsigned long long)filp->f_pos); filp->f_pos += sb->s_blocksize - offset; + brelse(bh); continue; } set_buffer_verified(bh); @@ -333,7 +334,7 @@ static inline loff_t ext4_get_htree_eof(struct file *filp) * * For non-htree, ext4_llseek already chooses the proper max offset. */ -loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) +static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; int dx_dir = is_dx_dir(inode); @@ -494,7 +495,7 @@ static int call_filldir(struct file *filp, void *dirent, { struct dir_private_info *info = filp->private_data; loff_t curr_pos; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct super_block *sb; int error; @@ -526,7 +527,7 @@ static int ext4_dx_readdir(struct file *filp, void *dirent, filldir_t filldir) { struct dir_private_info *info = filp->private_data; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct fname *fname; int ret; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8462eb3c33aa..3b83cd604796 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -194,8 +194,7 @@ struct mpage_da_data { */ #define EXT4_IO_END_UNWRITTEN 0x0001 #define EXT4_IO_END_ERROR 0x0002 -#define EXT4_IO_END_QUEUED 0x0004 -#define EXT4_IO_END_DIRECT 0x0008 +#define EXT4_IO_END_DIRECT 0x0004 struct ext4_io_page { struct page *p_page; @@ -215,10 +214,8 @@ typedef struct ext4_io_end { struct list_head list; /* per-file finished IO list */ struct inode *inode; /* file being written to */ unsigned int flag; /* unwritten or not */ - struct page *page; /* for writepage() path */ loff_t offset; /* offset in the file */ ssize_t size; /* size of the extent */ - struct work_struct work; /* data work queue */ struct kiocb *iocb; /* iocb struct for AIO */ int result; /* error value for AIO */ int num_io_pages; /* for writepages() */ @@ -338,9 +335,9 @@ struct ext4_group_desc */ struct flex_groups { - atomic_t free_inodes; - atomic_t free_clusters; - atomic_t used_dirs; + atomic64_t free_clusters; + atomic_t free_inodes; + atomic_t used_dirs; }; #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ @@ -582,6 +579,8 @@ enum { #define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 /* Do not take i_data_sem locking in ext4_map_blocks */ #define EXT4_GET_BLOCKS_NO_LOCK 0x0100 + /* Do not put hole in extent cache */ +#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 /* * Flags used by ext4_free_blocks @@ -810,17 +809,6 @@ do { \ #endif /* defined(__KERNEL__) || defined(__linux__) */ -/* - * storage for cached extent - * If ec_len == 0, then the cache is invalid. - * If ec_start == 0, then the cache represents a gap (null mapping) - */ -struct ext4_ext_cache { - ext4_fsblk_t ec_start; - ext4_lblk_t ec_block; - __u32 ec_len; /* must be 32bit to return holes */ -}; - #include "extents_status.h" /* @@ -887,7 +875,6 @@ struct ext4_inode_info { struct inode vfs_inode; struct jbd2_inode *jinode; - struct ext4_ext_cache i_cached_extent; /* * File creation time. Its function is same as that of * struct timespec i_{a,c,m}time in the generic inode. @@ -901,6 +888,8 @@ struct ext4_inode_info { /* extents status tree */ struct ext4_es_tree i_es_tree; rwlock_t i_es_lock; + struct list_head i_es_lru; + unsigned int i_es_lru_nr; /* protected by i_es_lock */ /* ialloc */ ext4_group_t i_last_alloc_group; @@ -930,6 +919,7 @@ struct ext4_inode_info { spinlock_t i_completed_io_lock; atomic_t i_ioend_count; /* Number of outstanding io_end structs */ atomic_t i_unwritten; /* Nr. of inflight conversions pending */ + struct work_struct i_unwritten_work; /* deferred extent conversion */ spinlock_t i_block_reservation_lock; @@ -985,7 +975,6 @@ struct ext4_inode_info { #define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ -#define EXT4_MOUNT_MBLK_IO_SUBMIT 0x4000000 /* multi-block io submits */ #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ @@ -1316,6 +1305,12 @@ struct ext4_sb_info { /* Precomputed FS UUID checksum for seeding other checksums */ __u32 s_csum_seed; + + /* Reclaim extents from extent status tree */ + struct shrinker s_es_shrinker; + struct list_head s_es_lru; + struct percpu_counter s_extent_cache_cnt; + spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -2007,9 +2002,20 @@ extern int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo); /* ialloc.c */ -extern struct inode *ext4_new_inode(handle_t *, struct inode *, umode_t, - const struct qstr *qstr, __u32 goal, - uid_t *owner); +extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t, + const struct qstr *qstr, __u32 goal, + uid_t *owner, int handle_type, + unsigned int line_no, int nblocks); + +#define ext4_new_inode(handle, dir, mode, qstr, goal, owner) \ + __ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \ + 0, 0, 0) +#define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \ + type, nblocks) \ + __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \ + (type), __LINE__, (nblocks)) + + extern void ext4_free_inode(handle_t *, struct inode *); extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); @@ -2103,6 +2109,7 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk); extern void ext4_ind_truncate(struct inode *inode); +extern int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); @@ -2151,6 +2158,8 @@ extern void *ext4_kvzalloc(size_t size, gfp_t flags); extern void ext4_kvfree(void *ptr); extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); +extern const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]); extern __printf(4, 5) void __ext4_error(struct super_block *, const char *, unsigned int, const char *, ...); @@ -2227,6 +2236,8 @@ extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group, struct ext4_group_desc *gdp); extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, struct ext4_group_desc *gdp); +extern int ext4_register_li_request(struct super_block *sb, + ext4_group_t first_not_zeroed); static inline int ext4_has_group_desc_csum(struct super_block *sb) { @@ -2454,6 +2465,75 @@ extern const struct file_operations ext4_file_operations; extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); extern void ext4_unwritten_wait(struct inode *inode); +/* inline.c */ +extern int ext4_has_inline_data(struct inode *inode); +extern int ext4_get_inline_size(struct inode *inode); +extern int ext4_get_max_inline_size(struct inode *inode); +extern int ext4_find_inline_data_nolock(struct inode *inode); +extern void ext4_write_inline_data(struct inode *inode, + struct ext4_iloc *iloc, + void *buffer, loff_t pos, + unsigned int len); +extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, + unsigned int len); +extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, + unsigned int len); +extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); + +extern int ext4_readpage_inline(struct inode *inode, struct page *page); +extern int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep); +extern int ext4_write_inline_data_end(struct inode *inode, + loff_t pos, unsigned len, + unsigned copied, + struct page *page); +extern struct buffer_head * +ext4_journalled_write_inline_data(struct inode *inode, + unsigned len, + struct page *page); +extern int ext4_da_write_inline_data_begin(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep, + void **fsdata); +extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, + unsigned len, unsigned copied, + struct page *page); +extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); +extern int ext4_try_create_inline_dir(handle_t *handle, + struct inode *parent, + struct inode *inode); +extern int ext4_read_inline_dir(struct file *filp, + void *dirent, filldir_t filldir, + int *has_inline_data); +extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, + int *has_inline_data); +extern int ext4_delete_inline_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + int *has_inline_data); +extern int empty_inline_dir(struct inode *dir, int *has_inline_data); +extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, + struct ext4_dir_entry_2 **parent_de, + int *retval); +extern int ext4_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + int *has_inline); +extern int ext4_try_to_evict_inline_data(handle_t *handle, + struct inode *inode, + int needed); +extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline); + +extern int ext4_convert_inline_data(struct inode *inode); + /* namei.c */ extern const struct inode_operations ext4_dir_inode_operations; extern const struct inode_operations ext4_special_inode_operations; @@ -2520,6 +2600,9 @@ extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, struct ext4_ext_path *); extern void ext4_ext_drop_refs(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); +extern int ext4_find_delalloc_range(struct inode *inode, + ext4_lblk_t lblk_start, + ext4_lblk_t lblk_end); extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); @@ -2534,9 +2617,10 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, extern int __init ext4_init_pageio(void); extern void ext4_add_complete_io(ext4_io_end_t *io_end); extern void ext4_exit_pageio(void); -extern void ext4_ioend_wait(struct inode *); +extern void ext4_ioend_shutdown(struct inode *); extern void ext4_free_io_end(ext4_io_end_t *io); extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); +extern void ext4_end_io_work(struct work_struct *work); extern void ext4_io_submit(struct ext4_io_submit *io); extern int ext4_bio_write_page(struct ext4_io_submit *io, struct page *page, diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 487fda12bc00..8643ff5bbeb7 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -193,12 +193,6 @@ static inline unsigned short ext_depth(struct inode *inode) return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); } -static inline void -ext4_ext_invalidate_cache(struct inode *inode) -{ - EXT4_I(inode)->i_cached_extent.ec_len = 0; -} - static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext) { /* We can not have an uninitialized extent of zero length! */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index b4323ba846b5..7058975e3a55 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -6,6 +6,108 @@ #include <trace/events/ext4.h> +/* Just increment the non-pointer handle value */ +static handle_t *ext4_get_nojournal(void) +{ + handle_t *handle = current->journal_info; + unsigned long ref_cnt = (unsigned long)handle; + + BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT); + + ref_cnt++; + handle = (handle_t *)ref_cnt; + + current->journal_info = handle; + return handle; +} + + +/* Decrement the non-pointer handle value */ +static void ext4_put_nojournal(handle_t *handle) +{ + unsigned long ref_cnt = (unsigned long)handle; + + BUG_ON(ref_cnt == 0); + + ref_cnt--; + handle = (handle_t *)ref_cnt; + + current->journal_info = handle; +} + +/* + * Wrappers for jbd2_journal_start/end. + */ +handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, + int type, int nblocks) +{ + journal_t *journal; + + trace_ext4_journal_start(sb, nblocks, _RET_IP_); + if (sb->s_flags & MS_RDONLY) + return ERR_PTR(-EROFS); + + WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); + journal = EXT4_SB(sb)->s_journal; + if (!journal) + return ext4_get_nojournal(); + /* + * Special case here: if the journal has aborted behind our + * backs (eg. EIO in the commit thread), then we still need to + * take the FS itself readonly cleanly. + */ + if (is_journal_aborted(journal)) { + ext4_abort(sb, "Detected aborted journal"); + return ERR_PTR(-EROFS); + } + return jbd2__journal_start(journal, nblocks, GFP_NOFS, type, line); +} + +int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) +{ + struct super_block *sb; + int err; + int rc; + + if (!ext4_handle_valid(handle)) { + ext4_put_nojournal(handle); + return 0; + } + sb = handle->h_transaction->t_journal->j_private; + err = handle->h_err; + rc = jbd2_journal_stop(handle); + + if (!err) + err = rc; + if (err) + __ext4_std_error(sb, where, line, err); + return err; +} + +void ext4_journal_abort_handle(const char *caller, unsigned int line, + const char *err_fn, struct buffer_head *bh, + handle_t *handle, int err) +{ + char nbuf[16]; + const char *errstr = ext4_decode_error(NULL, err, nbuf); + + BUG_ON(!ext4_handle_valid(handle)); + + if (bh) + BUFFER_TRACE(bh, "abort"); + + if (!handle->h_err) + handle->h_err = err; + + if (is_handle_aborted(handle)) + return; + + printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n", + caller, line, errstr, err_fn); + + jbd2_journal_abort_handle(handle); +} + int __ext4_journal_get_write_access(const char *where, unsigned int line, handle_t *handle, struct buffer_head *bh) { diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 7177f9b21cb2..4c216b1bf20c 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -59,12 +59,6 @@ #define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) -/* Delete operations potentially hit one directory's namespace plus an - * entire inode, plus arbitrary amounts of bitmap/indirection data. Be - * generous. We can grow the delete transaction later if necessary. */ - -#define EXT4_DELETE_TRANS_BLOCKS(sb) (2 * EXT4_DATA_TRANS_BLOCKS(sb) + 64) - /* Define an arbitrary limit for the amount of data we will anticipate * writing to any given transaction. For unbounded transactions such as * write(2) and truncate(2) we can write more than this, but we always @@ -110,6 +104,36 @@ #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) +static inline int ext4_jbd2_credits_xattr(struct inode *inode) +{ + int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb); + + /* + * In case of inline data, we may push out the data to a block, + * so we need to reserve credits for this eventuality + */ + if (ext4_has_inline_data(inode)) + credits += ext4_writepage_trans_blocks(inode) + 1; + return credits; +} + + +/* + * Ext4 handle operation types -- for logging purposes + */ +#define EXT4_HT_MISC 0 +#define EXT4_HT_INODE 1 +#define EXT4_HT_WRITE_PAGE 2 +#define EXT4_HT_MAP_BLOCKS 3 +#define EXT4_HT_DIR 4 +#define EXT4_HT_TRUNCATE 5 +#define EXT4_HT_QUOTA 6 +#define EXT4_HT_RESIZE 7 +#define EXT4_HT_MIGRATE 8 +#define EXT4_HT_MOVE_EXTENTS 9 +#define EXT4_HT_XATTR 10 +#define EXT4_HT_MAX 11 + /** * struct ext4_journal_cb_entry - Base structure for callback information. * @@ -234,7 +258,8 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line, #define ext4_handle_dirty_super(handle, sb) \ __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb)) -handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); +handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, + int type, int nblocks); int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) @@ -268,9 +293,17 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed) return 1; } -static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) +#define ext4_journal_start_sb(sb, type, nblocks) \ + __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks)) + +#define ext4_journal_start(inode, type, nblocks) \ + __ext4_journal_start((inode), __LINE__, (type), (nblocks)) + +static inline handle_t *__ext4_journal_start(struct inode *inode, + unsigned int line, int type, + int nblocks) { - return ext4_journal_start_sb(inode->i_sb, nblocks); + return __ext4_journal_start_sb(inode->i_sb, line, type, nblocks); } #define ext4_journal_stop(handle) \ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 5ae1674ec12f..56efcaadf848 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -112,7 +112,7 @@ static int ext4_split_extent_at(handle_t *handle, int flags); static int ext4_find_delayed_extent(struct inode *inode, - struct ext4_ext_cache *newex); + struct extent_status *newes); static int ext4_ext_truncate_extend_restart(handle_t *handle, struct inode *inode, @@ -714,7 +714,6 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode) eh->eh_magic = EXT4_EXT_MAGIC; eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); ext4_mark_inode_dirty(handle, inode); - ext4_ext_invalidate_cache(inode); return 0; } @@ -725,6 +724,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, struct ext4_extent_header *eh; struct buffer_head *bh; short int depth, i, ppos = 0, alloc = 0; + int ret; eh = ext_inode_hdr(inode); depth = ext_depth(inode); @@ -752,12 +752,15 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, path[ppos].p_ext = NULL; bh = sb_getblk(inode->i_sb, path[ppos].p_block); - if (unlikely(!bh)) + if (unlikely(!bh)) { + ret = -ENOMEM; goto err; + } if (!bh_uptodate_or_lock(bh)) { trace_ext4_ext_load_extent(inode, block, path[ppos].p_block); - if (bh_submit_read(bh) < 0) { + ret = bh_submit_read(bh); + if (ret < 0) { put_bh(bh); goto err; } @@ -768,13 +771,15 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, put_bh(bh); EXT4_ERROR_INODE(inode, "ppos %d > depth %d", ppos, depth); + ret = -EIO; goto err; } path[ppos].p_bh = bh; path[ppos].p_hdr = eh; i--; - if (ext4_ext_check_block(inode, eh, i, bh)) + ret = ext4_ext_check_block(inode, eh, i, bh); + if (ret < 0) goto err; } @@ -796,7 +801,7 @@ err: ext4_ext_drop_refs(path); if (alloc) kfree(path); - return ERR_PTR(-EIO); + return ERR_PTR(ret); } /* @@ -950,8 +955,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, goto cleanup; } bh = sb_getblk(inode->i_sb, newblock); - if (!bh) { - err = -EIO; + if (unlikely(!bh)) { + err = -ENOMEM; goto cleanup; } lock_buffer(bh); @@ -1023,8 +1028,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, oldblock = newblock; newblock = ablocks[--a]; bh = sb_getblk(inode->i_sb, newblock); - if (!bh) { - err = -EIO; + if (unlikely(!bh)) { + err = -ENOMEM; goto cleanup; } lock_buffer(bh); @@ -1136,11 +1141,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, return err; bh = sb_getblk(inode->i_sb, newblock); - if (!bh) { - err = -EIO; - ext4_std_error(inode->i_sb, err); - return err; - } + if (unlikely(!bh)) + return -ENOMEM; lock_buffer(bh); err = ext4_journal_get_create_access(handle, bh); @@ -1582,10 +1584,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, unsigned short ext1_ee_len, ext2_ee_len, max_len; /* - * Make sure that either both extents are uninitialized, or - * both are _not_. + * Make sure that both extents are initialized. We don't merge + * uninitialized extents so that we can be sure that end_io code has + * the extent that was written properly split out and conversion to + * initialized is trivial. */ - if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2)) + if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2)) return 0; if (ext4_ext_is_uninitialized(ex1)) @@ -1960,7 +1964,6 @@ cleanup: ext4_ext_drop_refs(npath); kfree(npath); } - ext4_ext_invalidate_cache(inode); return err; } @@ -1969,8 +1972,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode, struct fiemap_extent_info *fieinfo) { struct ext4_ext_path *path = NULL; - struct ext4_ext_cache newex; struct ext4_extent *ex; + struct extent_status es; ext4_lblk_t next, next_del, start = 0, end = 0; ext4_lblk_t last = block + num; int exists, depth = 0, err = 0; @@ -2044,37 +2047,47 @@ static int ext4_fill_fiemap_extents(struct inode *inode, BUG_ON(end <= start); if (!exists) { - newex.ec_block = start; - newex.ec_len = end - start; - newex.ec_start = 0; + es.es_lblk = start; + es.es_len = end - start; + es.es_pblk = 0; } else { - newex.ec_block = le32_to_cpu(ex->ee_block); - newex.ec_len = ext4_ext_get_actual_len(ex); - newex.ec_start = ext4_ext_pblock(ex); + es.es_lblk = le32_to_cpu(ex->ee_block); + es.es_len = ext4_ext_get_actual_len(ex); + es.es_pblk = ext4_ext_pblock(ex); if (ext4_ext_is_uninitialized(ex)) flags |= FIEMAP_EXTENT_UNWRITTEN; } /* - * Find delayed extent and update newex accordingly. We call - * it even in !exists case to find out whether newex is the + * Find delayed extent and update es accordingly. We call + * it even in !exists case to find out whether es is the * last existing extent or not. */ - next_del = ext4_find_delayed_extent(inode, &newex); + next_del = ext4_find_delayed_extent(inode, &es); if (!exists && next_del) { exists = 1; flags |= FIEMAP_EXTENT_DELALLOC; } up_read(&EXT4_I(inode)->i_data_sem); - if (unlikely(newex.ec_len == 0)) { - EXT4_ERROR_INODE(inode, "newex.ec_len == 0"); + if (unlikely(es.es_len == 0)) { + EXT4_ERROR_INODE(inode, "es.es_len == 0"); err = -EIO; break; } - /* This is possible iff next == next_del == EXT_MAX_BLOCKS */ - if (next == next_del) { + /* + * This is possible iff next == next_del == EXT_MAX_BLOCKS. + * we need to check next == EXT_MAX_BLOCKS because it is + * possible that an extent is with unwritten and delayed + * status due to when an extent is delayed allocated and + * is allocated by fallocate status tree will track both of + * them in a extent. + * + * So we could return a unwritten and delayed extent, and + * its block is equal to 'next'. + */ + if (next == next_del && next == EXT_MAX_BLOCKS) { flags |= FIEMAP_EXTENT_LAST; if (unlikely(next_del != EXT_MAX_BLOCKS || next != EXT_MAX_BLOCKS)) { @@ -2089,9 +2102,9 @@ static int ext4_fill_fiemap_extents(struct inode *inode, if (exists) { err = fiemap_fill_next_extent(fieinfo, - (__u64)newex.ec_block << blksize_bits, - (__u64)newex.ec_start << blksize_bits, - (__u64)newex.ec_len << blksize_bits, + (__u64)es.es_lblk << blksize_bits, + (__u64)es.es_pblk << blksize_bits, + (__u64)es.es_len << blksize_bits, flags); if (err < 0) break; @@ -2101,7 +2114,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode, } } - block = newex.ec_block + newex.ec_len; + block = es.es_lblk + es.es_len; } if (path) { @@ -2112,21 +2125,6 @@ static int ext4_fill_fiemap_extents(struct inode *inode, return err; } -static void -ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, - __u32 len, ext4_fsblk_t start) -{ - struct ext4_ext_cache *cex; - BUG_ON(len == 0); - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - trace_ext4_ext_put_in_cache(inode, block, len, start); - cex = &EXT4_I(inode)->i_cached_extent; - cex->ec_block = block; - cex->ec_len = len; - cex->ec_start = start; - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); -} - /* * ext4_ext_put_gap_in_cache: * calculate boundaries of the gap that the requested block fits into @@ -2143,9 +2141,10 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, ex = path[depth].p_ext; if (ex == NULL) { - /* there is no extent yet, so gap is [0;-] */ - lblock = 0; - len = EXT_MAX_BLOCKS; + /* + * there is no extent yet, so gap is [0;-] and we + * don't cache it + */ ext_debug("cache gap(whole file):"); } else if (block < le32_to_cpu(ex->ee_block)) { lblock = block; @@ -2154,6 +2153,9 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, block, le32_to_cpu(ex->ee_block), ext4_ext_get_actual_len(ex)); + if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) + ext4_es_insert_extent(inode, lblock, len, ~0, + EXTENT_STATUS_HOLE); } else if (block >= le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)) { ext4_lblk_t next; @@ -2167,58 +2169,15 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, block); BUG_ON(next == lblock); len = next - lblock; + if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) + ext4_es_insert_extent(inode, lblock, len, ~0, + EXTENT_STATUS_HOLE); } else { lblock = len = 0; BUG(); } ext_debug(" -> %u:%lu\n", lblock, len); - ext4_ext_put_in_cache(inode, lblock, len, 0); -} - -/* - * ext4_ext_in_cache() - * Checks to see if the given block is in the cache. - * If it is, the cached extent is stored in the given - * cache extent pointer. - * - * @inode: The files inode - * @block: The block to look for in the cache - * @ex: Pointer where the cached extent will be stored - * if it contains block - * - * Return 0 if cache is invalid; 1 if the cache is valid - */ -static int -ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, - struct ext4_extent *ex) -{ - struct ext4_ext_cache *cex; - int ret = 0; - - /* - * We borrow i_block_reservation_lock to protect i_cached_extent - */ - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - cex = &EXT4_I(inode)->i_cached_extent; - - /* has cache valid data? */ - if (cex->ec_len == 0) - goto errout; - - if (in_range(block, cex->ec_block, cex->ec_len)) { - ex->ee_block = cpu_to_le32(cex->ec_block); - ext4_ext_store_pblock(ex, cex->ec_start); - ex->ee_len = cpu_to_le16(cex->ec_len); - ext_debug("%u cached by %u:%u:%llu\n", - block, - cex->ec_block, cex->ec_len, cex->ec_start); - ret = 1; - } -errout: - trace_ext4_ext_in_cache(inode, block, ret); - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - return ret; } /* @@ -2653,13 +2612,11 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, ext_debug("truncate since %u to %u\n", start, end); /* probably first extent we're gonna free will be last in block */ - handle = ext4_journal_start(inode, depth + 1); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1); if (IS_ERR(handle)) return PTR_ERR(handle); again: - ext4_ext_invalidate_cache(inode); - trace_ext4_ext_remove_space(inode, start, depth); /* @@ -2968,7 +2925,7 @@ static int ext4_split_extent_at(handle_t *handle, { ext4_fsblk_t newblock; ext4_lblk_t ee_block; - struct ext4_extent *ex, newex, orig_ex; + struct ext4_extent *ex, newex, orig_ex, zero_ex; struct ext4_extent *ex2 = NULL; unsigned int ee_len, depth; int err = 0; @@ -2988,6 +2945,10 @@ static int ext4_split_extent_at(handle_t *handle, newblock = split - ee_block + ext4_ext_pblock(ex); BUG_ON(split < ee_block || split >= (ee_block + ee_len)); + BUG_ON(!ext4_ext_is_uninitialized(ex) && + split_flag & (EXT4_EXT_MAY_ZEROOUT | + EXT4_EXT_MARK_UNINIT1 | + EXT4_EXT_MARK_UNINIT2)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -3035,12 +2996,26 @@ static int ext4_split_extent_at(handle_t *handle, err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { - if (split_flag & EXT4_EXT_DATA_VALID1) + if (split_flag & EXT4_EXT_DATA_VALID1) { err = ext4_ext_zeroout(inode, ex2); - else + zero_ex.ee_block = ex2->ee_block; + zero_ex.ee_len = ext4_ext_get_actual_len(ex2); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex2)); + } else { err = ext4_ext_zeroout(inode, ex); - } else + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = ext4_ext_get_actual_len(ex); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex)); + } + } else { err = ext4_ext_zeroout(inode, &orig_ex); + zero_ex.ee_block = orig_ex.ee_block; + zero_ex.ee_len = ext4_ext_get_actual_len(&orig_ex); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(&orig_ex)); + } if (err) goto fix_extent_len; @@ -3048,6 +3023,12 @@ static int ext4_split_extent_at(handle_t *handle, ex->ee_len = cpu_to_le16(ee_len); ext4_ext_try_to_merge(handle, inode, path, ex); err = ext4_ext_dirty(handle, inode, path + path->p_depth); + if (err) + goto fix_extent_len; + + /* update extent status tree */ + err = ext4_es_zeroout(inode, &zero_ex); + goto out; } else if (err) goto fix_extent_len; @@ -3086,6 +3067,7 @@ static int ext4_split_extent(handle_t *handle, int err = 0; int uninitialized; int split_flag1, flags1; + int allocated = map->m_len; depth = ext_depth(inode); ex = path[depth].p_ext; @@ -3105,20 +3087,29 @@ static int ext4_split_extent(handle_t *handle, map->m_lblk + map->m_len, split_flag1, flags1); if (err) goto out; + } else { + allocated = ee_len - (map->m_lblk - ee_block); } - + /* + * Update path is required because previous ext4_split_extent_at() may + * result in split of original leaf or extent zeroout. + */ ext4_ext_drop_refs(path); path = ext4_ext_find_extent(inode, map->m_lblk, path); if (IS_ERR(path)) return PTR_ERR(path); + depth = ext_depth(inode); + ex = path[depth].p_ext; + uninitialized = ext4_ext_is_uninitialized(ex); + split_flag1 = 0; if (map->m_lblk >= ee_block) { - split_flag1 = split_flag & (EXT4_EXT_MAY_ZEROOUT | - EXT4_EXT_DATA_VALID2); - if (uninitialized) + split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; + if (uninitialized) { split_flag1 |= EXT4_EXT_MARK_UNINIT1; - if (split_flag & EXT4_EXT_MARK_UNINIT2) - split_flag1 |= EXT4_EXT_MARK_UNINIT2; + split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | + EXT4_EXT_MARK_UNINIT2); + } err = ext4_split_extent_at(handle, inode, path, map->m_lblk, split_flag1, flags); if (err) @@ -3127,7 +3118,7 @@ static int ext4_split_extent(handle_t *handle, ext4_ext_show_leaf(inode, path); out: - return err ? err : map->m_len; + return err ? err : allocated; } /* @@ -3182,6 +3173,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); allocated = ee_len - (map->m_lblk - ee_block); + zero_ex.ee_len = 0; trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); @@ -3272,13 +3264,16 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, if (EXT4_EXT_MAY_ZEROOUT & split_flag) max_zeroout = sbi->s_extent_max_zeroout_kb >> - inode->i_sb->s_blocksize_bits; + (inode->i_sb->s_blocksize_bits - 10); /* If extent is less than s_max_zeroout_kb, zeroout directly */ if (max_zeroout && (ee_len <= max_zeroout)) { err = ext4_ext_zeroout(inode, ex); if (err) goto out; + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = ext4_ext_get_actual_len(ex); + ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -3337,6 +3332,9 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, err = allocated; out: + /* If we have gotten a failure, don't zero out status tree */ + if (!err) + err = ext4_es_zeroout(inode, &zero_ex); return err ? err : allocated; } @@ -3419,8 +3417,19 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, "block %llu, max_blocks %u\n", inode->i_ino, (unsigned long long)ee_block, ee_len); - /* If extent is larger than requested then split is required */ + /* If extent is larger than requested it is a clear sign that we still + * have some extent state machine issues left. So extent_split is still + * required. + * TODO: Once all related issues will be fixed this situation should be + * illegal. + */ if (ee_block != map->m_lblk || ee_len > map->m_len) { +#ifdef EXT4_DEBUG + ext4_warning("Inode (%ld) finished: extent logical block %llu," + " len %u; IO logical block %llu, len %u\n", + inode->i_ino, (unsigned long long)ee_block, ee_len, + (unsigned long long)map->m_lblk, map->m_len); +#endif err = ext4_split_unwritten_extents(handle, inode, map, path, EXT4_GET_BLOCKS_CONVERT); if (err < 0) @@ -3519,19 +3528,19 @@ out: * * Return 1 if there is a delalloc block in the range, otherwise 0. */ -static int ext4_find_delalloc_range(struct inode *inode, - ext4_lblk_t lblk_start, - ext4_lblk_t lblk_end) +int ext4_find_delalloc_range(struct inode *inode, + ext4_lblk_t lblk_start, + ext4_lblk_t lblk_end) { struct extent_status es; - es.start = lblk_start; - ext4_es_find_extent(inode, &es); - if (es.len == 0) + ext4_es_find_delayed_extent(inode, lblk_start, &es); + if (es.es_len == 0) return 0; /* there is no delay extent in this tree */ - else if (es.start <= lblk_start && lblk_start < es.start + es.len) + else if (es.es_lblk <= lblk_start && + lblk_start < es.es_lblk + es.es_len) return 1; - else if (lblk_start <= es.start && es.start <= lblk_end) + else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end) return 1; else return 0; @@ -3656,6 +3665,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, ext4_set_io_unwritten_flag(inode, io); else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); + map->m_flags |= EXT4_MAP_UNWRITTEN; if (ext4_should_dioread_nolock(inode)) map->m_flags |= EXT4_MAP_UNINIT; goto out; @@ -3670,6 +3680,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, path, map->m_len); } else err = ret; + map->m_flags |= EXT4_MAP_MAPPED; + if (allocated > map->m_len) + allocated = map->m_len; + map->m_len = allocated; goto out2; } /* buffered IO case */ @@ -3677,8 +3691,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, * repeat fallocate creation request * we already have an unwritten extent */ - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) { + map->m_flags |= EXT4_MAP_UNWRITTEN; goto map_out; + } /* buffered READ or buffered write_begin() lookup */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { @@ -3717,6 +3733,7 @@ out: allocated - map->m_len); allocated = map->m_len; } + map->m_len = allocated; /* * If we have done fallocate with the offset that is already @@ -3898,35 +3915,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, map->m_lblk, map->m_len, inode->i_ino); trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); - /* check in cache */ - if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { - if (!newex.ee_start_lo && !newex.ee_start_hi) { - if ((sbi->s_cluster_ratio > 1) && - ext4_find_delalloc_cluster(inode, map->m_lblk)) - map->m_flags |= EXT4_MAP_FROM_CLUSTER; - - if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { - /* - * block isn't allocated yet and - * user doesn't want to allocate it - */ - goto out2; - } - /* we should allocate requested block */ - } else { - /* block is already allocated */ - if (sbi->s_cluster_ratio > 1) - map->m_flags |= EXT4_MAP_FROM_CLUSTER; - newblock = map->m_lblk - - le32_to_cpu(newex.ee_block) - + ext4_ext_pblock(&newex); - /* number of remaining blocks in the extent */ - allocated = ext4_ext_get_actual_len(&newex) - - (map->m_lblk - le32_to_cpu(newex.ee_block)); - goto out; - } - } - /* find extent for this block */ path = ext4_ext_find_extent(inode, map->m_lblk, NULL); if (IS_ERR(path)) { @@ -3973,15 +3961,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, ee_block, ee_len, newblock); - /* - * Do not put uninitialized extent - * in the cache - */ - if (!ext4_ext_is_uninitialized(ex)) { - ext4_ext_put_in_cache(inode, ee_block, - ee_len, ee_start); + if (!ext4_ext_is_uninitialized(ex)) goto out; - } + allocated = ext4_ext_handle_uninitialized_extents( handle, inode, map, path, flags, allocated, newblock); @@ -4002,7 +3984,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * put just found gap into cache to speed up * subsequent requests */ - ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); + if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0) + ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); goto out2; } @@ -4108,6 +4091,7 @@ got_allocated_blocks: /* Mark uninitialized */ if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ ext4_ext_mark_uninitialized(&newex); + map->m_flags |= EXT4_MAP_UNWRITTEN; /* * io_end structure was created for every IO write to an * uninitialized extent. To avoid unnecessary conversion, @@ -4181,9 +4165,6 @@ got_allocated_blocks: } } else { BUG_ON(allocated_clusters < reserved_clusters); - /* We will claim quota for all newly allocated blocks.*/ - ext4_da_update_reserve_space(inode, allocated_clusters, - 1); if (reserved_clusters < allocated_clusters) { struct ext4_inode_info *ei = EXT4_I(inode); int reservation = allocated_clusters - @@ -4234,6 +4215,15 @@ got_allocated_blocks: ei->i_reserved_data_blocks += reservation; spin_unlock(&ei->i_block_reservation_lock); } + /* + * We will claim quota for all newly allocated blocks. + * We're updating the reserved space *after* the + * correction above so we do not accidentally free + * all the metadata reservation because we might + * actually need it later on. + */ + ext4_da_update_reserve_space(inode, allocated_clusters, + 1); } } @@ -4241,10 +4231,9 @@ got_allocated_blocks: * Cache the extent and update transaction to commit on fdatasync only * when it is _not_ an uninitialized extent. */ - if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { - ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock); + if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) ext4_update_inode_fsync_trans(handle, inode, 1); - } else + else ext4_update_inode_fsync_trans(handle, inode, 0); out: if (allocated > map->m_len) @@ -4284,7 +4273,7 @@ void ext4_ext_truncate(struct inode *inode) * probably first extent we're gonna free will be last in block */ err = ext4_writepage_trans_blocks(inode); - handle = ext4_journal_start(inode, err); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, err); if (IS_ERR(handle)) return; @@ -4303,7 +4292,6 @@ void ext4_ext_truncate(struct inode *inode) goto out_stop; down_write(&EXT4_I(inode)->i_data_sem); - ext4_ext_invalidate_cache(inode); ext4_discard_preallocations(inode); @@ -4386,7 +4374,7 @@ static void ext4_falloc_update_inode(struct inode *inode, */ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); handle_t *handle; loff_t new_size; unsigned int max_blocks; @@ -4397,13 +4385,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) struct ext4_map_blocks map; unsigned int credits, blkbits = inode->i_blkbits; - /* - * currently supporting (pre)allocate mode for extent-based - * files _only_ - */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - return -EOPNOTSUPP; - /* Return error if mode is not supported */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; @@ -4415,6 +4396,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (ret) return ret; + /* + * currently supporting (pre)allocate mode for extent-based + * files _only_ + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return -EOPNOTSUPP; + trace_ext4_fallocate_enter(inode, offset, len, mode); map.m_lblk = offset >> blkbits; /* @@ -4445,13 +4433,12 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (len <= EXT_UNINIT_MAX_LEN << blkbits) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; - /* Prevent race condition between unwritten */ - ext4_flush_unwritten_io(inode); retry: while (ret >= 0 && ret < max_blocks) { map.m_lblk = map.m_lblk + ret; map.m_len = max_blocks = max_blocks - ret; - handle = ext4_journal_start(inode, credits); + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); break; @@ -4459,11 +4446,11 @@ retry: ret = ext4_map_blocks(handle, inode, &map, flags); if (ret <= 0) { #ifdef EXT4FS_DEBUG - WARN_ON(ret <= 0); - printk(KERN_ERR "%s: ext4_ext_map_blocks " - "returned error inode#%lu, block=%u, " - "max_blocks=%u", __func__, - inode->i_ino, map.m_lblk, max_blocks); + ext4_warning(inode->i_sb, + "inode #%lu: block %u: len %u: " + "ext4_ext_map_blocks returned %d", + inode->i_ino, map.m_lblk, + map.m_len, ret); #endif ext4_mark_inode_dirty(handle, inode); ret2 = ext4_journal_stop(handle); @@ -4529,21 +4516,19 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, while (ret >= 0 && ret < max_blocks) { map.m_lblk += ret; map.m_len = (max_blocks -= ret); - handle = ext4_journal_start(inode, credits); + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); break; } ret = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_IO_CONVERT_EXT); - if (ret <= 0) { - WARN_ON(ret <= 0); - ext4_msg(inode->i_sb, KERN_ERR, - "%s:%d: inode #%lu: block %u: len %u: " - "ext4_ext_map_blocks returned %d", - __func__, __LINE__, inode->i_ino, map.m_lblk, - map.m_len, ret); - } + if (ret <= 0) + ext4_warning(inode->i_sb, + "inode #%lu: block %u: len %u: " + "ext4_ext_map_blocks returned %d", + inode->i_ino, map.m_lblk, + map.m_len, ret); ext4_mark_inode_dirty(handle, inode); ret2 = ext4_journal_stop(handle); if (ret <= 0 || ret2 ) @@ -4553,42 +4538,48 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, } /* - * If newex is not existing extent (newex->ec_start equals zero) find - * delayed extent at start of newex and update newex accordingly and + * If newes is not existing extent (newes->ec_pblk equals zero) find + * delayed extent at start of newes and update newes accordingly and * return start of the next delayed extent. * - * If newex is existing extent (newex->ec_start is not equal zero) + * If newes is existing extent (newes->ec_pblk is not equal zero) * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed - * extent found. Leave newex unmodified. + * extent found. Leave newes unmodified. */ static int ext4_find_delayed_extent(struct inode *inode, - struct ext4_ext_cache *newex) + struct extent_status *newes) { struct extent_status es; - ext4_lblk_t next_del; + ext4_lblk_t block, next_del; - es.start = newex->ec_block; - next_del = ext4_es_find_extent(inode, &es); + ext4_es_find_delayed_extent(inode, newes->es_lblk, &es); - if (newex->ec_start == 0) { + if (newes->es_pblk == 0) { /* - * No extent in extent-tree contains block @newex->ec_start, + * No extent in extent-tree contains block @newes->es_pblk, * then the block may stay in 1)a hole or 2)delayed-extent. */ - if (es.len == 0) + if (es.es_len == 0) /* A hole found. */ return 0; - if (es.start > newex->ec_block) { + if (es.es_lblk > newes->es_lblk) { /* A hole found. */ - newex->ec_len = min(es.start - newex->ec_block, - newex->ec_len); + newes->es_len = min(es.es_lblk - newes->es_lblk, + newes->es_len); return 0; } - newex->ec_len = es.start + es.len - newex->ec_block; + newes->es_len = es.es_lblk + es.es_len - newes->es_lblk; } + block = newes->es_lblk + newes->es_len; + ext4_es_find_delayed_extent(inode, block, &es); + if (es.es_len == 0) + next_del = EXT_MAX_BLOCKS; + else + next_del = es.es_lblk; + return next_del; } /* fiemap flags we can handle specified here */ @@ -4643,7 +4634,7 @@ static int ext4_xattr_fiemap(struct inode *inode, */ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; ext4_lblk_t first_block, stop_block; struct address_space *mapping = inode->i_mapping; @@ -4709,7 +4700,7 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) inode_dio_wait(inode); credits = ext4_writepage_trans_blocks(inode); - handle = ext4_journal_start(inode, credits); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto out_dio; @@ -4786,14 +4777,12 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) goto out; down_write(&EXT4_I(inode)->i_data_sem); - ext4_ext_invalidate_cache(inode); ext4_discard_preallocations(inode); err = ext4_es_remove_extent(inode, first_block, stop_block - first_block); err = ext4_ext_remove_space(inode, first_block, stop_block - 1); - ext4_ext_invalidate_cache(inode); ext4_discard_preallocations(inode); if (IS_SYNC(inode)) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 564d981a2fcc..fe3337a85ede 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -23,40 +23,53 @@ * (e.g. Reservation space warning), and provide extent-level locking. * Delay extent tree is the first step to achieve this goal. It is * original built by Yongqiang Yang. At that time it is called delay - * extent tree, whose goal is only track delay extent in memory to + * extent tree, whose goal is only track delayed extents in memory to * simplify the implementation of fiemap and bigalloc, and introduce * lseek SEEK_DATA/SEEK_HOLE support. That is why it is still called - * delay extent tree at the following comment. But for better - * understand what it does, it has been rename to extent status tree. + * delay extent tree at the first commit. But for better understand + * what it does, it has been rename to extent status tree. * - * Currently the first step has been done. All delay extents are - * tracked in the tree. It maintains the delay extent when a delay - * allocation is issued, and the delay extent is written out or + * Step1: + * Currently the first step has been done. All delayed extents are + * tracked in the tree. It maintains the delayed extent when a delayed + * allocation is issued, and the delayed extent is written out or * invalidated. Therefore the implementation of fiemap and bigalloc * are simplified, and SEEK_DATA/SEEK_HOLE are introduced. * * The following comment describes the implemenmtation of extent * status tree and future works. + * + * Step2: + * In this step all extent status are tracked by extent status tree. + * Thus, we can first try to lookup a block mapping in this tree before + * finding it in extent tree. Hence, single extent cache can be removed + * because extent status tree can do a better job. Extents in status + * tree are loaded on-demand. Therefore, the extent status tree may not + * contain all of the extents in a file. Meanwhile we define a shrinker + * to reclaim memory from extent status tree because fragmented extent + * tree will make status tree cost too much memory. written/unwritten/- + * hole extents in the tree will be reclaimed by this shrinker when we + * are under high memory pressure. Delayed extents will not be + * reclimed because fiemap, bigalloc, and seek_data/hole need it. */ /* - * extents status tree implementation for ext4. + * Extent status tree implementation for ext4. * * * ========================================================================== - * Extents status encompass delayed extents and extent locks + * Extent status tree tracks all extent status. * - * 1. Why delayed extent implementation ? + * 1. Why we need to implement extent status tree? * - * Without delayed extent, ext4 identifies a delayed extent by looking + * Without extent status tree, ext4 identifies a delayed extent by looking * up page cache, this has several deficiencies - complicated, buggy, * and inefficient code. * - * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need - * to know if a block or a range of blocks are belonged to a delayed - * extent. + * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a + * block or a range of blocks are belonged to a delayed extent. * - * Let us have a look at how they do without delayed extents implementation. + * Let us have a look at how they do without extent status tree. * -- FIEMAP * FIEMAP looks up page cache to identify delayed allocations from holes. * @@ -68,47 +81,48 @@ * already under delayed allocation or not to determine whether * quota reserving is needed for the cluster. * - * -- punch hole - * punch hole looks up page cache to identify a delayed extent. - * * -- writeout * Writeout looks up whole page cache to see if a buffer is * mapped, If there are not very many delayed buffers, then it is * time comsuming. * - * With delayed extents implementation, FIEMAP, SEEK_HOLE/DATA, + * With extent status tree implementation, FIEMAP, SEEK_HOLE/DATA, * bigalloc and writeout can figure out if a block or a range of * blocks is under delayed allocation(belonged to a delayed extent) or - * not by searching the delayed extent tree. + * not by searching the extent tree. * * * ========================================================================== - * 2. ext4 delayed extents impelmentation + * 2. Ext4 extent status tree impelmentation + * + * -- extent + * A extent is a range of blocks which are contiguous logically and + * physically. Unlike extent in extent tree, this extent in ext4 is + * a in-memory struct, there is no corresponding on-disk data. There + * is no limit on length of extent, so an extent can contain as many + * blocks as they are contiguous logically and physically. * - * -- delayed extent - * A delayed extent is a range of blocks which are contiguous - * logically and under delayed allocation. Unlike extent in - * ext4, delayed extent in ext4 is a in-memory struct, there is - * no corresponding on-disk data. There is no limit on length of - * delayed extent, so a delayed extent can contain as many blocks - * as they are contiguous logically. + * -- extent status tree + * Every inode has an extent status tree and all allocation blocks + * are added to the tree with different status. The extent in the + * tree are ordered by logical block no. * - * -- delayed extent tree - * Every inode has a delayed extent tree and all under delayed - * allocation blocks are added to the tree as delayed extents. - * Delayed extents in the tree are ordered by logical block no. + * -- operations on a extent status tree + * There are three important operations on a delayed extent tree: find + * next extent, adding a extent(a range of blocks) and removing a extent. * - * -- operations on a delayed extent tree - * There are three operations on a delayed extent tree: find next - * delayed extent, adding a space(a range of blocks) and removing - * a space. + * -- race on a extent status tree + * Extent status tree is protected by inode->i_es_lock. * - * -- race on a delayed extent tree - * Delayed extent tree is protected inode->i_es_lock. + * -- memory consumption + * Fragmented extent tree will make extent status tree cost too much + * memory. Hence, we will reclaim written/unwritten/hole extents from + * the tree under a heavy memory pressure. * * * ========================================================================== - * 3. performance analysis + * 3. Performance analysis + * * -- overhead * 1. There is a cache extent for write access, so if writes are * not very random, adding space operaions are in O(1) time. @@ -120,18 +134,25 @@ * * ========================================================================== * 4. TODO list - * -- Track all extent status * - * -- Improve get block process + * -- Refactor delayed space reservation * * -- Extent-level locking */ static struct kmem_cache *ext4_es_cachep; +static int __es_insert_extent(struct inode *inode, struct extent_status *newes); +static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t end); +static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, + int nr_to_scan); + int __init ext4_init_es(void) { - ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT); + ext4_es_cachep = kmem_cache_create("ext4_extent_status", + sizeof(struct extent_status), + 0, (SLAB_RECLAIM_ACCOUNT), NULL); if (ext4_es_cachep == NULL) return -ENOMEM; return 0; @@ -161,7 +182,9 @@ static void ext4_es_print_tree(struct inode *inode) while (node) { struct extent_status *es; es = rb_entry(node, struct extent_status, rb_node); - printk(KERN_DEBUG " [%u/%u)", es->start, es->len); + printk(KERN_DEBUG " [%u/%u) %llu %llx", + es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); node = rb_next(node); } printk(KERN_DEBUG "\n"); @@ -170,10 +193,10 @@ static void ext4_es_print_tree(struct inode *inode) #define ext4_es_print_tree(inode) #endif -static inline ext4_lblk_t extent_status_end(struct extent_status *es) +static inline ext4_lblk_t ext4_es_end(struct extent_status *es) { - BUG_ON(es->start + es->len < es->start); - return es->start + es->len - 1; + BUG_ON(es->es_lblk + es->es_len < es->es_lblk); + return es->es_lblk + es->es_len - 1; } /* @@ -181,25 +204,25 @@ static inline ext4_lblk_t extent_status_end(struct extent_status *es) * it can't be found, try to find next extent. */ static struct extent_status *__es_tree_search(struct rb_root *root, - ext4_lblk_t offset) + ext4_lblk_t lblk) { struct rb_node *node = root->rb_node; struct extent_status *es = NULL; while (node) { es = rb_entry(node, struct extent_status, rb_node); - if (offset < es->start) + if (lblk < es->es_lblk) node = node->rb_left; - else if (offset > extent_status_end(es)) + else if (lblk > ext4_es_end(es)) node = node->rb_right; else return es; } - if (es && offset < es->start) + if (es && lblk < es->es_lblk) return es; - if (es && offset > extent_status_end(es)) { + if (es && lblk > ext4_es_end(es)) { node = rb_next(&es->rb_node); return node ? rb_entry(node, struct extent_status, rb_node) : NULL; @@ -209,79 +232,134 @@ static struct extent_status *__es_tree_search(struct rb_root *root, } /* - * ext4_es_find_extent: find the 1st delayed extent covering @es->start - * if it exists, otherwise, the next extent after @es->start. + * ext4_es_find_delayed_extent: find the 1st delayed extent covering @es->lblk + * if it exists, otherwise, the next extent after @es->lblk. * * @inode: the inode which owns delayed extents + * @lblk: the offset where we start to search * @es: delayed extent that we found - * - * Returns the first block of the next extent after es, otherwise - * EXT_MAX_BLOCKS if no delay extent is found. - * Delayed extent is returned via @es. */ -ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es) +void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, + struct extent_status *es) { struct ext4_es_tree *tree = NULL; struct extent_status *es1 = NULL; struct rb_node *node; - ext4_lblk_t ret = EXT_MAX_BLOCKS; - trace_ext4_es_find_extent_enter(inode, es->start); + BUG_ON(es == NULL); + trace_ext4_es_find_delayed_extent_enter(inode, lblk); read_lock(&EXT4_I(inode)->i_es_lock); tree = &EXT4_I(inode)->i_es_tree; - /* find delay extent in cache firstly */ + /* find extent in cache firstly */ + es->es_lblk = es->es_len = es->es_pblk = 0; if (tree->cache_es) { es1 = tree->cache_es; - if (in_range(es->start, es1->start, es1->len)) { - es_debug("%u cached by [%u/%u)\n", - es->start, es1->start, es1->len); + if (in_range(lblk, es1->es_lblk, es1->es_len)) { + es_debug("%u cached by [%u/%u) %llu %llx\n", + lblk, es1->es_lblk, es1->es_len, + ext4_es_pblock(es1), ext4_es_status(es1)); goto out; } } - es->len = 0; - es1 = __es_tree_search(&tree->root, es->start); + es1 = __es_tree_search(&tree->root, lblk); out: - if (es1) { - tree->cache_es = es1; - es->start = es1->start; - es->len = es1->len; - node = rb_next(&es1->rb_node); - if (node) { + if (es1 && !ext4_es_is_delayed(es1)) { + while ((node = rb_next(&es1->rb_node)) != NULL) { es1 = rb_entry(node, struct extent_status, rb_node); - ret = es1->start; + if (ext4_es_is_delayed(es1)) + break; } } + if (es1 && ext4_es_is_delayed(es1)) { + tree->cache_es = es1; + es->es_lblk = es1->es_lblk; + es->es_len = es1->es_len; + es->es_pblk = es1->es_pblk; + } + read_unlock(&EXT4_I(inode)->i_es_lock); - trace_ext4_es_find_extent_exit(inode, es, ret); - return ret; + ext4_es_lru_add(inode); + trace_ext4_es_find_delayed_extent_exit(inode, es); } static struct extent_status * -ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len) +ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, + ext4_fsblk_t pblk) { struct extent_status *es; es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC); if (es == NULL) return NULL; - es->start = start; - es->len = len; + es->es_lblk = lblk; + es->es_len = len; + es->es_pblk = pblk; + + /* + * We don't count delayed extent because we never try to reclaim them + */ + if (!ext4_es_is_delayed(es)) { + EXT4_I(inode)->i_es_lru_nr++; + percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt); + } + return es; } -static void ext4_es_free_extent(struct extent_status *es) +static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) { + /* Decrease the lru counter when this es is not delayed */ + if (!ext4_es_is_delayed(es)) { + BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0); + EXT4_I(inode)->i_es_lru_nr--; + percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt); + } + kmem_cache_free(ext4_es_cachep, es); } +/* + * Check whether or not two extents can be merged + * Condition: + * - logical block number is contiguous + * - physical block number is contiguous + * - status is equal + */ +static int ext4_es_can_be_merged(struct extent_status *es1, + struct extent_status *es2) +{ + if (ext4_es_status(es1) != ext4_es_status(es2)) + return 0; + + if (((__u64) es1->es_len) + es2->es_len > 0xFFFFFFFFULL) + return 0; + + if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk) + return 0; + + if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && + (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2))) + return 1; + + if (ext4_es_is_hole(es1)) + return 1; + + /* we need to check delayed extent is without unwritten status */ + if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1)) + return 1; + + return 0; +} + static struct extent_status * -ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es) +ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es) { + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; struct extent_status *es1; struct rb_node *node; @@ -290,10 +368,10 @@ ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es) return es; es1 = rb_entry(node, struct extent_status, rb_node); - if (es->start == extent_status_end(es1) + 1) { - es1->len += es->len; + if (ext4_es_can_be_merged(es1, es)) { + es1->es_len += es->es_len; rb_erase(&es->rb_node, &tree->root); - ext4_es_free_extent(es); + ext4_es_free_extent(inode, es); es = es1; } @@ -301,8 +379,9 @@ ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es) } static struct extent_status * -ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es) +ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es) { + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; struct extent_status *es1; struct rb_node *node; @@ -311,69 +390,230 @@ ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es) return es; es1 = rb_entry(node, struct extent_status, rb_node); - if (es1->start == extent_status_end(es) + 1) { - es->len += es1->len; + if (ext4_es_can_be_merged(es, es1)) { + es->es_len += es1->es_len; rb_erase(node, &tree->root); - ext4_es_free_extent(es1); + ext4_es_free_extent(inode, es1); } return es; } -static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset, - ext4_lblk_t len) +#ifdef ES_AGGRESSIVE_TEST +static void ext4_es_insert_extent_ext_check(struct inode *inode, + struct extent_status *es) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent *ex; + ext4_lblk_t ee_block; + ext4_fsblk_t ee_start; + unsigned short ee_len; + int depth, ee_status, es_status; + + path = ext4_ext_find_extent(inode, es->es_lblk, NULL); + if (IS_ERR(path)) + return; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + + if (ex) { + + ee_block = le32_to_cpu(ex->ee_block); + ee_start = ext4_ext_pblock(ex); + ee_len = ext4_ext_get_actual_len(ex); + + ee_status = ext4_ext_is_uninitialized(ex) ? 1 : 0; + es_status = ext4_es_is_unwritten(es) ? 1 : 0; + + /* + * Make sure ex and es are not overlap when we try to insert + * a delayed/hole extent. + */ + if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) { + if (in_range(es->es_lblk, ee_block, ee_len)) { + pr_warn("ES insert assertation failed for " + "inode: %lu we can find an extent " + "at block [%d/%d/%llu/%c], but we " + "want to add an delayed/hole extent " + "[%d/%d/%llu/%llx]\n", + inode->i_ino, ee_block, ee_len, + ee_start, ee_status ? 'u' : 'w', + es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + } + goto out; + } + + /* + * We don't check ee_block == es->es_lblk, etc. because es + * might be a part of whole extent, vice versa. + */ + if (es->es_lblk < ee_block || + ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) { + pr_warn("ES insert assertation failed for inode: %lu " + "ex_status [%d/%d/%llu/%c] != " + "es_status [%d/%d/%llu/%c]\n", inode->i_ino, + ee_block, ee_len, ee_start, + ee_status ? 'u' : 'w', es->es_lblk, es->es_len, + ext4_es_pblock(es), es_status ? 'u' : 'w'); + goto out; + } + + if (ee_status ^ es_status) { + pr_warn("ES insert assertation failed for inode: %lu " + "ex_status [%d/%d/%llu/%c] != " + "es_status [%d/%d/%llu/%c]\n", inode->i_ino, + ee_block, ee_len, ee_start, + ee_status ? 'u' : 'w', es->es_lblk, es->es_len, + ext4_es_pblock(es), es_status ? 'u' : 'w'); + } + } else { + /* + * We can't find an extent on disk. So we need to make sure + * that we don't want to add an written/unwritten extent. + */ + if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { + pr_warn("ES insert assertation failed for inode: %lu " + "can't find an extent at block %d but we want " + "to add an written/unwritten extent " + "[%d/%d/%llu/%llx]\n", inode->i_ino, + es->es_lblk, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + } + } +out: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } +} + +static void ext4_es_insert_extent_ind_check(struct inode *inode, + struct extent_status *es) +{ + struct ext4_map_blocks map; + int retval; + + /* + * Here we call ext4_ind_map_blocks to lookup a block mapping because + * 'Indirect' structure is defined in indirect.c. So we couldn't + * access direct/indirect tree from outside. It is too dirty to define + * this function in indirect.c file. + */ + + map.m_lblk = es->es_lblk; + map.m_len = es->es_len; + + retval = ext4_ind_map_blocks(NULL, inode, &map, 0); + if (retval > 0) { + if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) { + /* + * We want to add a delayed/hole extent but this + * block has been allocated. + */ + pr_warn("ES insert assertation failed for inode: %lu " + "We can find blocks but we want to add a " + "delayed/hole extent [%d/%d/%llu/%llx]\n", + inode->i_ino, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + return; + } else if (ext4_es_is_written(es)) { + if (retval != es->es_len) { + pr_warn("ES insert assertation failed for " + "inode: %lu retval %d != es_len %d\n", + inode->i_ino, retval, es->es_len); + return; + } + if (map.m_pblk != ext4_es_pblock(es)) { + pr_warn("ES insert assertation failed for " + "inode: %lu m_pblk %llu != " + "es_pblk %llu\n", + inode->i_ino, map.m_pblk, + ext4_es_pblock(es)); + return; + } + } else { + /* + * We don't need to check unwritten extent because + * indirect-based file doesn't have it. + */ + BUG_ON(1); + } + } else if (retval == 0) { + if (ext4_es_is_written(es)) { + pr_warn("ES insert assertation failed for inode: %lu " + "We can't find the block but we want to add " + "an written extent [%d/%d/%llu/%llx]\n", + inode->i_ino, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + return; + } + } +} + +static inline void ext4_es_insert_extent_check(struct inode *inode, + struct extent_status *es) +{ + /* + * We don't need to worry about the race condition because + * caller takes i_data_sem locking. + */ + BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ext4_es_insert_extent_ext_check(inode, es); + else + ext4_es_insert_extent_ind_check(inode, es); +} +#else +static inline void ext4_es_insert_extent_check(struct inode *inode, + struct extent_status *es) { +} +#endif + +static int __es_insert_extent(struct inode *inode, struct extent_status *newes) +{ + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; struct rb_node **p = &tree->root.rb_node; struct rb_node *parent = NULL; struct extent_status *es; - ext4_lblk_t end = offset + len - 1; - - BUG_ON(end < offset); - es = tree->cache_es; - if (es && offset == (extent_status_end(es) + 1)) { - es_debug("cached by [%u/%u)\n", es->start, es->len); - es->len += len; - es = ext4_es_try_to_merge_right(tree, es); - goto out; - } else if (es && es->start == end + 1) { - es_debug("cached by [%u/%u)\n", es->start, es->len); - es->start = offset; - es->len += len; - es = ext4_es_try_to_merge_left(tree, es); - goto out; - } else if (es && es->start <= offset && - end <= extent_status_end(es)) { - es_debug("cached by [%u/%u)\n", es->start, es->len); - goto out; - } while (*p) { parent = *p; es = rb_entry(parent, struct extent_status, rb_node); - if (offset < es->start) { - if (es->start == end + 1) { - es->start = offset; - es->len += len; - es = ext4_es_try_to_merge_left(tree, es); + if (newes->es_lblk < es->es_lblk) { + if (ext4_es_can_be_merged(newes, es)) { + /* + * Here we can modify es_lblk directly + * because it isn't overlapped. + */ + es->es_lblk = newes->es_lblk; + es->es_len += newes->es_len; + if (ext4_es_is_written(es) || + ext4_es_is_unwritten(es)) + ext4_es_store_pblock(es, + newes->es_pblk); + es = ext4_es_try_to_merge_left(inode, es); goto out; } p = &(*p)->rb_left; - } else if (offset > extent_status_end(es)) { - if (offset == extent_status_end(es) + 1) { - es->len += len; - es = ext4_es_try_to_merge_right(tree, es); + } else if (newes->es_lblk > ext4_es_end(es)) { + if (ext4_es_can_be_merged(es, newes)) { + es->es_len += newes->es_len; + es = ext4_es_try_to_merge_right(inode, es); goto out; } p = &(*p)->rb_right; } else { - if (extent_status_end(es) <= end) - es->len = offset - es->start + len; - goto out; + BUG_ON(1); + return -EINVAL; } } - es = ext4_es_alloc_extent(offset, len); + es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len, + newes->es_pblk); if (!es) return -ENOMEM; rb_link_node(&es->rb_node, parent, p); @@ -385,85 +625,168 @@ out: } /* - * ext4_es_insert_extent() adds a space to a delayed extent tree. - * Caller holds inode->i_es_lock. + * ext4_es_insert_extent() adds a space to a extent status tree. * * ext4_es_insert_extent is called by ext4_da_write_begin and * ext4_es_remove_extent. * * Return 0 on success, error code on failure. */ -int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset, - ext4_lblk_t len) +int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned long long status) { - struct ext4_es_tree *tree; + struct extent_status newes; + ext4_lblk_t end = lblk + len - 1; int err = 0; - trace_ext4_es_insert_extent(inode, offset, len); - es_debug("add [%u/%u) to extent status tree of inode %lu\n", - offset, len, inode->i_ino); + es_debug("add [%u/%u) %llu %llx to extent status tree of inode %lu\n", + lblk, len, pblk, status, inode->i_ino); + + if (!len) + return 0; + + BUG_ON(end < lblk); + + newes.es_lblk = lblk; + newes.es_len = len; + ext4_es_store_pblock(&newes, pblk); + ext4_es_store_status(&newes, status); + trace_ext4_es_insert_extent(inode, &newes); + + ext4_es_insert_extent_check(inode, &newes); write_lock(&EXT4_I(inode)->i_es_lock); - tree = &EXT4_I(inode)->i_es_tree; - err = __es_insert_extent(tree, offset, len); + err = __es_remove_extent(inode, lblk, end); + if (err != 0) + goto error; + err = __es_insert_extent(inode, &newes); + +error: write_unlock(&EXT4_I(inode)->i_es_lock); + ext4_es_lru_add(inode); ext4_es_print_tree(inode); return err; } /* - * ext4_es_remove_extent() removes a space from a delayed extent tree. - * Caller holds inode->i_es_lock. + * ext4_es_lookup_extent() looks up an extent in extent status tree. * - * Return 0 on success, error code on failure. + * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks. + * + * Return: 1 on found, 0 on not */ -int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset, - ext4_lblk_t len) +int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + struct extent_status *es) { - struct rb_node *node; struct ext4_es_tree *tree; + struct extent_status *es1 = NULL; + struct rb_node *node; + int found = 0; + + trace_ext4_es_lookup_extent_enter(inode, lblk); + es_debug("lookup extent in block %u\n", lblk); + + tree = &EXT4_I(inode)->i_es_tree; + read_lock(&EXT4_I(inode)->i_es_lock); + + /* find extent in cache firstly */ + es->es_lblk = es->es_len = es->es_pblk = 0; + if (tree->cache_es) { + es1 = tree->cache_es; + if (in_range(lblk, es1->es_lblk, es1->es_len)) { + es_debug("%u cached by [%u/%u)\n", + lblk, es1->es_lblk, es1->es_len); + found = 1; + goto out; + } + } + + node = tree->root.rb_node; + while (node) { + es1 = rb_entry(node, struct extent_status, rb_node); + if (lblk < es1->es_lblk) + node = node->rb_left; + else if (lblk > ext4_es_end(es1)) + node = node->rb_right; + else { + found = 1; + break; + } + } + +out: + if (found) { + BUG_ON(!es1); + es->es_lblk = es1->es_lblk; + es->es_len = es1->es_len; + es->es_pblk = es1->es_pblk; + } + + read_unlock(&EXT4_I(inode)->i_es_lock); + + ext4_es_lru_add(inode); + trace_ext4_es_lookup_extent_exit(inode, es, found); + return found; +} + +static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t end) +{ + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; + struct rb_node *node; struct extent_status *es; struct extent_status orig_es; - ext4_lblk_t len1, len2, end; + ext4_lblk_t len1, len2; + ext4_fsblk_t block; int err = 0; - trace_ext4_es_remove_extent(inode, offset, len); - es_debug("remove [%u/%u) from extent status tree of inode %lu\n", - offset, len, inode->i_ino); - - end = offset + len - 1; - BUG_ON(end < offset); - write_lock(&EXT4_I(inode)->i_es_lock); - tree = &EXT4_I(inode)->i_es_tree; - es = __es_tree_search(&tree->root, offset); + es = __es_tree_search(&tree->root, lblk); if (!es) goto out; - if (es->start > end) + if (es->es_lblk > end) goto out; /* Simply invalidate cache_es. */ tree->cache_es = NULL; - orig_es.start = es->start; - orig_es.len = es->len; - len1 = offset > es->start ? offset - es->start : 0; - len2 = extent_status_end(es) > end ? - extent_status_end(es) - end : 0; + orig_es.es_lblk = es->es_lblk; + orig_es.es_len = es->es_len; + orig_es.es_pblk = es->es_pblk; + + len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0; + len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0; if (len1 > 0) - es->len = len1; + es->es_len = len1; if (len2 > 0) { if (len1 > 0) { - err = __es_insert_extent(tree, end + 1, len2); + struct extent_status newes; + + newes.es_lblk = end + 1; + newes.es_len = len2; + if (ext4_es_is_written(&orig_es) || + ext4_es_is_unwritten(&orig_es)) { + block = ext4_es_pblock(&orig_es) + + orig_es.es_len - len2; + ext4_es_store_pblock(&newes, block); + } + ext4_es_store_status(&newes, ext4_es_status(&orig_es)); + err = __es_insert_extent(inode, &newes); if (err) { - es->start = orig_es.start; - es->len = orig_es.len; + es->es_lblk = orig_es.es_lblk; + es->es_len = orig_es.es_len; goto out; } } else { - es->start = end + 1; - es->len = len2; + es->es_lblk = end + 1; + es->es_len = len2; + if (ext4_es_is_written(es) || + ext4_es_is_unwritten(es)) { + block = orig_es.es_pblk + orig_es.es_len - len2; + ext4_es_store_pblock(es, block); + } } goto out; } @@ -476,10 +799,10 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset, es = NULL; } - while (es && extent_status_end(es) <= end) { + while (es && ext4_es_end(es) <= end) { node = rb_next(&es->rb_node); rb_erase(&es->rb_node, &tree->root); - ext4_es_free_extent(es); + ext4_es_free_extent(inode, es); if (!node) { es = NULL; break; @@ -487,14 +810,183 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset, es = rb_entry(node, struct extent_status, rb_node); } - if (es && es->start < end + 1) { - len1 = extent_status_end(es) - end; - es->start = end + 1; - es->len = len1; + if (es && es->es_lblk < end + 1) { + ext4_lblk_t orig_len = es->es_len; + + len1 = ext4_es_end(es) - end; + es->es_lblk = end + 1; + es->es_len = len1; + if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) { + block = es->es_pblk + orig_len - len1; + ext4_es_store_pblock(es, block); + } } out: + return err; +} + +/* + * ext4_es_remove_extent() removes a space from a extent status tree. + * + * Return 0 on success, error code on failure. + */ +int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) +{ + ext4_lblk_t end; + int err = 0; + + trace_ext4_es_remove_extent(inode, lblk, len); + es_debug("remove [%u/%u) from extent status tree of inode %lu\n", + lblk, len, inode->i_ino); + + if (!len) + return err; + + end = lblk + len - 1; + BUG_ON(end < lblk); + + write_lock(&EXT4_I(inode)->i_es_lock); + err = __es_remove_extent(inode, lblk, end); write_unlock(&EXT4_I(inode)->i_es_lock); ext4_es_print_tree(inode); return err; } + +int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex) +{ + ext4_lblk_t ee_block; + ext4_fsblk_t ee_pblock; + unsigned int ee_len; + + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + ee_pblock = ext4_ext_pblock(ex); + + if (ee_len == 0) + return 0; + + return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, + EXTENT_STATUS_WRITTEN); +} + +static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) +{ + struct ext4_sb_info *sbi = container_of(shrink, + struct ext4_sb_info, s_es_shrinker); + struct ext4_inode_info *ei; + struct list_head *cur, *tmp, scanned; + int nr_to_scan = sc->nr_to_scan; + int ret, nr_shrunk = 0; + + ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); + trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret); + + if (!nr_to_scan) + return ret; + + INIT_LIST_HEAD(&scanned); + + spin_lock(&sbi->s_es_lru_lock); + list_for_each_safe(cur, tmp, &sbi->s_es_lru) { + list_move_tail(cur, &scanned); + + ei = list_entry(cur, struct ext4_inode_info, i_es_lru); + + read_lock(&ei->i_es_lock); + if (ei->i_es_lru_nr == 0) { + read_unlock(&ei->i_es_lock); + continue; + } + read_unlock(&ei->i_es_lock); + + write_lock(&ei->i_es_lock); + ret = __es_try_to_reclaim_extents(ei, nr_to_scan); + write_unlock(&ei->i_es_lock); + + nr_shrunk += ret; + nr_to_scan -= ret; + if (nr_to_scan == 0) + break; + } + list_splice_tail(&scanned, &sbi->s_es_lru); + spin_unlock(&sbi->s_es_lru_lock); + + ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); + trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret); + return ret; +} + +void ext4_es_register_shrinker(struct super_block *sb) +{ + struct ext4_sb_info *sbi; + + sbi = EXT4_SB(sb); + INIT_LIST_HEAD(&sbi->s_es_lru); + spin_lock_init(&sbi->s_es_lru_lock); + sbi->s_es_shrinker.shrink = ext4_es_shrink; + sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; + register_shrinker(&sbi->s_es_shrinker); +} + +void ext4_es_unregister_shrinker(struct super_block *sb) +{ + unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker); +} + +void ext4_es_lru_add(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + spin_lock(&sbi->s_es_lru_lock); + if (list_empty(&ei->i_es_lru)) + list_add_tail(&ei->i_es_lru, &sbi->s_es_lru); + else + list_move_tail(&ei->i_es_lru, &sbi->s_es_lru); + spin_unlock(&sbi->s_es_lru_lock); +} + +void ext4_es_lru_del(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + spin_lock(&sbi->s_es_lru_lock); + if (!list_empty(&ei->i_es_lru)) + list_del_init(&ei->i_es_lru); + spin_unlock(&sbi->s_es_lru_lock); +} + +static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, + int nr_to_scan) +{ + struct inode *inode = &ei->vfs_inode; + struct ext4_es_tree *tree = &ei->i_es_tree; + struct rb_node *node; + struct extent_status *es; + int nr_shrunk = 0; + + if (ei->i_es_lru_nr == 0) + return 0; + + node = rb_first(&tree->root); + while (node != NULL) { + es = rb_entry(node, struct extent_status, rb_node); + node = rb_next(&es->rb_node); + /* + * We can't reclaim delayed extent from status tree because + * fiemap, bigallic, and seek_data/hole need to use it. + */ + if (!ext4_es_is_delayed(es)) { + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(inode, es); + nr_shrunk++; + if (--nr_to_scan == 0) + break; + } + } + tree->cache_es = NULL; + return nr_shrunk; +} diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 077f82db092a..d8e2d4dc311e 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -20,10 +20,32 @@ #define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif +/* + * With ES_AGGRESSIVE_TEST defined, the result of es caching will be + * checked with old map_block's result. + */ +#define ES_AGGRESSIVE_TEST__ + +/* + * These flags live in the high bits of extent_status.es_pblk + */ +#define EXTENT_STATUS_WRITTEN (1ULL << 63) +#define EXTENT_STATUS_UNWRITTEN (1ULL << 62) +#define EXTENT_STATUS_DELAYED (1ULL << 61) +#define EXTENT_STATUS_HOLE (1ULL << 60) + +#define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \ + EXTENT_STATUS_UNWRITTEN | \ + EXTENT_STATUS_DELAYED | \ + EXTENT_STATUS_HOLE) + +struct ext4_extent; + struct extent_status { struct rb_node rb_node; - ext4_lblk_t start; /* first block extent covers */ - ext4_lblk_t len; /* length of extent in block */ + ext4_lblk_t es_lblk; /* first logical block extent covers */ + ext4_lblk_t es_len; /* length of extent in block */ + ext4_fsblk_t es_pblk; /* first physical block */ }; struct ext4_es_tree { @@ -35,11 +57,70 @@ extern int __init ext4_init_es(void); extern void ext4_exit_es(void); extern void ext4_es_init_tree(struct ext4_es_tree *tree); -extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t start, +extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned long long status); +extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); -extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t start, - ext4_lblk_t len); -extern ext4_lblk_t ext4_es_find_extent(struct inode *inode, - struct extent_status *es); +extern void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, + struct extent_status *es); +extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + struct extent_status *es); +extern int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex); + +static inline int ext4_es_is_written(struct extent_status *es) +{ + return (es->es_pblk & EXTENT_STATUS_WRITTEN) != 0; +} + +static inline int ext4_es_is_unwritten(struct extent_status *es) +{ + return (es->es_pblk & EXTENT_STATUS_UNWRITTEN) != 0; +} + +static inline int ext4_es_is_delayed(struct extent_status *es) +{ + return (es->es_pblk & EXTENT_STATUS_DELAYED) != 0; +} + +static inline int ext4_es_is_hole(struct extent_status *es) +{ + return (es->es_pblk & EXTENT_STATUS_HOLE) != 0; +} + +static inline ext4_fsblk_t ext4_es_status(struct extent_status *es) +{ + return (es->es_pblk & EXTENT_STATUS_FLAGS); +} + +static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) +{ + return (es->es_pblk & ~EXTENT_STATUS_FLAGS); +} + +static inline void ext4_es_store_pblock(struct extent_status *es, + ext4_fsblk_t pb) +{ + ext4_fsblk_t block; + + block = (pb & ~EXTENT_STATUS_FLAGS) | + (es->es_pblk & EXTENT_STATUS_FLAGS); + es->es_pblk = block; +} + +static inline void ext4_es_store_status(struct extent_status *es, + unsigned long long status) +{ + ext4_fsblk_t block; + + block = (status & EXTENT_STATUS_FLAGS) | + (es->es_pblk & ~EXTENT_STATUS_FLAGS); + es->es_pblk = block; +} + +extern void ext4_es_register_shrinker(struct super_block *sb); +extern void ext4_es_unregister_shrinker(struct super_block *sb); +extern void ext4_es_lru_add(struct inode *inode); +extern void ext4_es_lru_del(struct inode *inode); #endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 405565a62277..64848b595b24 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -167,7 +167,7 @@ static ssize_t ext4_file_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; /* @@ -240,7 +240,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) handle_t *handle; int err; - handle = ext4_journal_start_sb(sb, 1); + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); if (IS_ERR(handle)) return PTR_ERR(handle); err = ext4_journal_get_write_access(handle, sbi->s_sbh); @@ -464,10 +464,8 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) * If there is a delay extent at this offset, * it will be as a data. */ - es.start = last; - (void)ext4_es_find_extent(inode, &es); - if (last >= es.start && - last < es.start + es.len) { + ext4_es_find_delayed_extent(inode, last, &es); + if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { if (last != start) dataoff = last << blkbits; break; @@ -549,11 +547,9 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) * If there is a delay extent at this offset, * we will skip this extent. */ - es.start = last; - (void)ext4_es_find_extent(inode, &es); - if (last >= es.start && - last < es.start + es.len) { - last = es.start + es.len; + ext4_es_find_delayed_extent(inode, last, &es); + if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { + last = es.es_lblk + es.es_len; holeoff = last << blkbits; continue; } diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index fa8e4911d354..3d586f02883e 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -155,11 +155,11 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) /* Check to see if the seed is all zero's */ if (hinfo->seed) { for (i = 0; i < 4; i++) { - if (hinfo->seed[i]) + if (hinfo->seed[i]) { + memcpy(buf, hinfo->seed, sizeof(buf)); break; + } } - if (i < 4) - memcpy(buf, hinfo->seed, sizeof(buf)); } switch (hinfo->hash_version) { diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 3f32c8012447..6c5bb8d993fe 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -324,8 +324,8 @@ error_return: } struct orlov_stats { + __u64 free_clusters; __u32 free_inodes; - __u32 free_clusters; __u32 used_dirs; }; @@ -342,7 +342,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, if (flex_size > 1) { stats->free_inodes = atomic_read(&flex_group[g].free_inodes); - stats->free_clusters = atomic_read(&flex_group[g].free_clusters); + stats->free_clusters = atomic64_read(&flex_group[g].free_clusters); stats->used_dirs = atomic_read(&flex_group[g].used_dirs); return; } @@ -634,8 +634,10 @@ static int find_group_other(struct super_block *sb, struct inode *parent, * For other inodes, search forward from the parent directory's block * group to find a free inode. */ -struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, - const struct qstr *qstr, __u32 goal, uid_t *owner) +struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, + umode_t mode, const struct qstr *qstr, + __u32 goal, uid_t *owner, int handle_type, + unsigned int line_no, int nblocks) { struct super_block *sb; struct buffer_head *inode_bitmap_bh = NULL; @@ -725,6 +727,15 @@ repeat_in_this_group: "inode=%lu", ino + 1); continue; } + if (!handle) { + BUG_ON(nblocks <= 0); + handle = __ext4_journal_start_sb(dir->i_sb, line_no, + handle_type, nblocks); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto fail; + } + } BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode_bitmap_bh); if (err) @@ -1017,17 +1028,17 @@ iget_failed: inode = NULL; bad_orphan: ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino); - printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n", + printk(KERN_WARNING "ext4_test_bit(bit=%d, block=%llu) = %d\n", bit, (unsigned long long)bitmap_bh->b_blocknr, ext4_test_bit(bit, bitmap_bh->b_data)); - printk(KERN_NOTICE "inode=%p\n", inode); + printk(KERN_WARNING "inode=%p\n", inode); if (inode) { - printk(KERN_NOTICE "is_bad_inode(inode)=%d\n", + printk(KERN_WARNING "is_bad_inode(inode)=%d\n", is_bad_inode(inode)); - printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", + printk(KERN_WARNING "NEXT_ORPHAN(inode)=%u\n", NEXT_ORPHAN(inode)); - printk(KERN_NOTICE "max_ino=%lu\n", max_ino); - printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink); + printk(KERN_WARNING "max_ino=%lu\n", max_ino); + printk(KERN_WARNING "i_nlink=%u\n", inode->i_nlink); /* Avoid freeing blocks if we got a bad deleted inode */ if (inode->i_nlink == 0) inode->i_blocks = 0; @@ -1137,7 +1148,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)) goto out; - handle = ext4_journal_start_sb(sb, 1); + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 20862f96e8ae..b505a145a593 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -146,6 +146,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, struct super_block *sb = inode->i_sb; Indirect *p = chain; struct buffer_head *bh; + int ret = -EIO; *err = 0; /* i_data is not going away, no lock needed */ @@ -154,8 +155,10 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, goto no_block; while (--depth) { bh = sb_getblk(sb, le32_to_cpu(p->key)); - if (unlikely(!bh)) + if (unlikely(!bh)) { + ret = -ENOMEM; goto failure; + } if (!bh_uptodate_or_lock(bh)) { if (bh_submit_read(bh) < 0) { @@ -177,7 +180,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, return NULL; failure: - *err = -EIO; + *err = ret; no_block: return p; } @@ -355,9 +358,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, * for the first direct block */ new_blocks[index] = current_block; - printk(KERN_INFO "%s returned more blocks than " + WARN(1, KERN_INFO "%s returned more blocks than " "requested\n", __func__); - WARN_ON(1); break; } } @@ -471,7 +473,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, */ bh = sb_getblk(inode->i_sb, new_blocks[n-1]); if (unlikely(!bh)) { - err = -EIO; + err = -ENOMEM; goto failed; } @@ -789,7 +791,7 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, if (final_size > inode->i_size) { /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, 2); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; @@ -849,7 +851,7 @@ locked: int err; /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, 2); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { /* This is really bad luck. We've written the data * but cannot extend i_size. Bail out and pretend @@ -948,7 +950,8 @@ static handle_t *start_transaction(struct inode *inode) { handle_t *result; - result = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)); + result = ext4_journal_start(inode, EXT4_HT_TRUNCATE, + ext4_blocks_for_truncate(inode)); if (!IS_ERR(result)) return result; @@ -1515,3 +1518,243 @@ out_stop: trace_ext4_truncate_exit(inode); } +static int free_hole_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *parent_bh, __le32 *i_data, + int level, ext4_lblk_t first, + ext4_lblk_t count, int max) +{ + struct buffer_head *bh = NULL; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int ret = 0; + int i, inc; + ext4_lblk_t offset; + __le32 blk; + + inc = 1 << ((EXT4_BLOCK_SIZE_BITS(inode->i_sb) - 2) * level); + for (i = 0, offset = 0; i < max; i++, i_data++, offset += inc) { + if (offset >= count + first) + break; + if (*i_data == 0 || (offset + inc) <= first) + continue; + blk = *i_data; + if (level > 0) { + ext4_lblk_t first2; + bh = sb_bread(inode->i_sb, blk); + if (!bh) { + EXT4_ERROR_INODE_BLOCK(inode, blk, + "Read failure"); + return -EIO; + } + first2 = (first > offset) ? first - offset : 0; + ret = free_hole_blocks(handle, inode, bh, + (__le32 *)bh->b_data, level - 1, + first2, count - offset, + inode->i_sb->s_blocksize >> 2); + if (ret) { + brelse(bh); + goto err; + } + } + if (level == 0 || + (bh && all_zeroes((__le32 *)bh->b_data, + (__le32 *)bh->b_data + addr_per_block))) { + ext4_free_data(handle, inode, parent_bh, &blk, &blk+1); + *i_data = 0; + } + brelse(bh); + bh = NULL; + } + +err: + return ret; +} + +static int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t first, ext4_lblk_t stop) +{ + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int level, ret = 0; + int num = EXT4_NDIR_BLOCKS; + ext4_lblk_t count, max = EXT4_NDIR_BLOCKS; + __le32 *i_data = EXT4_I(inode)->i_data; + + count = stop - first; + for (level = 0; level < 4; level++, max *= addr_per_block) { + if (first < max) { + ret = free_hole_blocks(handle, inode, NULL, i_data, + level, first, count, num); + if (ret) + goto err; + if (count > max - first) + count -= max - first; + else + break; + first = 0; + } else { + first -= max; + } + i_data += num; + if (level == 0) { + num = 1; + max = 1; + } + } + +err: + return ret; +} + +int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length) +{ + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + ext4_lblk_t first_block, stop_block; + struct address_space *mapping = inode->i_mapping; + handle_t *handle = NULL; + loff_t first_page, last_page, page_len; + loff_t first_page_offset, last_page_offset; + int err = 0; + + /* + * Write out all dirty pages to avoid race conditions + * Then release them. + */ + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + err = filemap_write_and_wait_range(mapping, + offset, offset + length - 1); + if (err) + return err; + } + + mutex_lock(&inode->i_mutex); + /* It's not possible punch hole on append only file */ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { + err = -EPERM; + goto out_mutex; + } + if (IS_SWAPFILE(inode)) { + err = -ETXTBSY; + goto out_mutex; + } + + /* No need to punch hole beyond i_size */ + if (offset >= inode->i_size) + goto out_mutex; + + /* + * If the hole extents beyond i_size, set the hole + * to end after the page that contains i_size + */ + if (offset + length > inode->i_size) { + length = inode->i_size + + PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - + offset; + } + + first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + last_page = (offset + length) >> PAGE_CACHE_SHIFT; + + first_page_offset = first_page << PAGE_CACHE_SHIFT; + last_page_offset = last_page << PAGE_CACHE_SHIFT; + + /* Now release the pages */ + if (last_page_offset > first_page_offset) { + truncate_pagecache_range(inode, first_page_offset, + last_page_offset - 1); + } + + /* Wait all existing dio works, newcomers will block on i_mutex */ + inode_dio_wait(inode); + + handle = start_transaction(inode); + if (IS_ERR(handle)) + goto out_mutex; + + /* + * Now we need to zero out the non-page-aligned data in the + * pages at the start and tail of the hole, and unmap the buffer + * heads for the block aligned regions of the page that were + * completely zerod. + */ + if (first_page > last_page) { + /* + * If the file space being truncated is contained within a page + * just zero out and unmap the middle of that page + */ + err = ext4_discard_partial_page_buffers(handle, + mapping, offset, length, 0); + if (err) + goto out; + } else { + /* + * Zero out and unmap the paritial page that contains + * the start of the hole + */ + page_len = first_page_offset - offset; + if (page_len > 0) { + err = ext4_discard_partial_page_buffers(handle, mapping, + offset, page_len, 0); + if (err) + goto out; + } + + /* + * Zero out and unmap the partial page that contains + * the end of the hole + */ + page_len = offset + length - last_page_offset; + if (page_len > 0) { + err = ext4_discard_partial_page_buffers(handle, mapping, + last_page_offset, page_len, 0); + if (err) + goto out; + } + } + + /* + * If i_size contained in the last page, we need to + * unmap and zero the paritial page after i_size + */ + if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && + inode->i_size % PAGE_CACHE_SIZE != 0) { + page_len = PAGE_CACHE_SIZE - + (inode->i_size & (PAGE_CACHE_SIZE - 1)); + if (page_len > 0) { + err = ext4_discard_partial_page_buffers(handle, + mapping, inode->i_size, page_len, 0); + if (err) + goto out; + } + } + + first_block = (offset + sb->s_blocksize - 1) >> + EXT4_BLOCK_SIZE_BITS(sb); + stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); + + if (first_block >= stop_block) + goto out; + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + err = ext4_es_remove_extent(inode, first_block, + stop_block - first_block); + err = ext4_free_hole_blocks(handle, inode, first_block, stop_block); + + ext4_discard_preallocations(inode); + + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + + up_write(&EXT4_I(inode)->i_data_sem); + +out: + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + +out_mutex: + mutex_unlock(&inode->i_mutex); + + return err; +} diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 387c47c6cda9..c0fd1a123f7d 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -545,7 +545,7 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping, return ret; retry: - handle = ext4_journal_start(inode, needed_blocks); + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); handle = NULL; @@ -657,7 +657,7 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, * The possible write could happen in the inode, * so try to reserve the space in inode first. */ - handle = ext4_journal_start(inode, 1); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); handle = NULL; @@ -853,7 +853,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, if (ret) return ret; - handle = ext4_journal_start(inode, 1); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); handle = NULL; @@ -1188,7 +1188,7 @@ static int ext4_convert_inline_data_nolock(handle_t *handle, data_bh = sb_getblk(inode->i_sb, map.m_pblk); if (!data_bh) { - error = -EIO; + error = -ENOMEM; goto out_restore; } @@ -1298,7 +1298,7 @@ int ext4_read_inline_dir(struct file *filp, int i, stored; struct ext4_dir_entry_2 *de; struct super_block *sb; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); int ret, inline_size = 0; struct ext4_iloc iloc; void *dir_buf = NULL; @@ -1770,7 +1770,7 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline) needed_blocks = ext4_writepage_trans_blocks(inode); - handle = ext4_journal_start(inode, needed_blocks); + handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks); if (IS_ERR(handle)) return; @@ -1862,7 +1862,7 @@ int ext4_convert_inline_data(struct inode *inode) if (error) return error; - handle = ext4_journal_start(inode, needed_blocks); + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto out_free; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cbfe13bf5b2a..b3a5213bc73e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -132,10 +132,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, } static void ext4_invalidatepage(struct page *page, unsigned long offset); -static int noalloc_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); -static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); -static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); static int __ext4_journalled_writepage(struct page *page, unsigned int len); static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, @@ -189,8 +185,6 @@ void ext4_evict_inode(struct inode *inode) trace_ext4_evict_inode(inode); - ext4_ioend_wait(inode); - if (inode->i_nlink) { /* * When journalling data dirty buffers are tracked only in the @@ -211,7 +205,8 @@ void ext4_evict_inode(struct inode *inode) * don't use page cache. */ if (ext4_should_journal_data(inode) && - (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && + inode->i_ino != EXT4_JOURNAL_INO) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; @@ -220,6 +215,7 @@ void ext4_evict_inode(struct inode *inode) filemap_write_and_wait(&inode->i_data); } truncate_inode_pages(&inode->i_data, 0); + ext4_ioend_shutdown(inode); goto no_delete; } @@ -229,6 +225,7 @@ void ext4_evict_inode(struct inode *inode) if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages(&inode->i_data, 0); + ext4_ioend_shutdown(inode); if (is_bad_inode(inode)) goto no_delete; @@ -238,7 +235,8 @@ void ext4_evict_inode(struct inode *inode) * protection against it */ sb_start_intwrite(inode->i_sb); - handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, + ext4_blocks_for_truncate(inode)+3); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* @@ -346,7 +344,7 @@ void ext4_da_update_reserve_space(struct inode *inode, spin_lock(&ei->i_block_reservation_lock); trace_ext4_da_update_reserve_space(inode, used, quota_claim); if (unlikely(used > ei->i_reserved_data_blocks)) { - ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " + ext4_warning(inode->i_sb, "%s: ino %lu, used %d " "with only %d reserved data blocks", __func__, inode->i_ino, used, ei->i_reserved_data_blocks); @@ -355,10 +353,12 @@ void ext4_da_update_reserve_space(struct inode *inode, } if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) { - ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, allocated %d " - "with only %d reserved metadata blocks\n", __func__, - inode->i_ino, ei->i_allocated_meta_blocks, - ei->i_reserved_meta_blocks); + ext4_warning(inode->i_sb, "ino %lu, allocated %d " + "with only %d reserved metadata blocks " + "(releasing %d blocks with reserved %d data blocks)", + inode->i_ino, ei->i_allocated_meta_blocks, + ei->i_reserved_meta_blocks, used, + ei->i_reserved_data_blocks); WARN_ON(1); ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks; } @@ -483,6 +483,58 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, return num; } +#ifdef ES_AGGRESSIVE_TEST +static void ext4_map_blocks_es_recheck(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *es_map, + struct ext4_map_blocks *map, + int flags) +{ + int retval; + + map->m_flags = 0; + /* + * There is a race window that the result is not the same. + * e.g. xfstests #223 when dioread_nolock enables. The reason + * is that we lookup a block mapping in extent status tree with + * out taking i_data_sem. So at the time the unwritten extent + * could be converted. + */ + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) + down_read((&EXT4_I(inode)->i_data_sem)); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); + } else { + retval = ext4_ind_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); + } + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) + up_read((&EXT4_I(inode)->i_data_sem)); + /* + * Clear EXT4_MAP_FROM_CLUSTER and EXT4_MAP_BOUNDARY flag + * because it shouldn't be marked in es_map->m_flags. + */ + map->m_flags &= ~(EXT4_MAP_FROM_CLUSTER | EXT4_MAP_BOUNDARY); + + /* + * We don't check m_len because extent will be collpased in status + * tree. So the m_len might not equal. + */ + if (es_map->m_lblk != map->m_lblk || + es_map->m_flags != map->m_flags || + es_map->m_pblk != map->m_pblk) { + printk("ES cache assertation failed for inode: %lu " + "es_cached ex [%d/%d/%llu/%x] != " + "found ex [%d/%d/%llu/%x] retval %d flags %x\n", + inode->i_ino, es_map->m_lblk, es_map->m_len, + es_map->m_pblk, es_map->m_flags, map->m_lblk, + map->m_len, map->m_pblk, map->m_flags, + retval, flags); + } +} +#endif /* ES_AGGRESSIVE_TEST */ + /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. @@ -508,12 +560,42 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { + struct extent_status es; int retval; +#ifdef ES_AGGRESSIVE_TEST + struct ext4_map_blocks orig_map; + + memcpy(&orig_map, map, sizeof(*map)); +#endif map->m_flags = 0; ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," "logical block %lu\n", inode->i_ino, flags, map->m_len, (unsigned long) map->m_lblk); + + /* Lookup extent status tree firstly */ + if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { + map->m_pblk = ext4_es_pblock(&es) + + map->m_lblk - es.es_lblk; + map->m_flags |= ext4_es_is_written(&es) ? + EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN; + retval = es.es_len - (map->m_lblk - es.es_lblk); + if (retval > map->m_len) + retval = map->m_len; + map->m_len = retval; + } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { + retval = 0; + } else { + BUG_ON(1); + } +#ifdef ES_AGGRESSIVE_TEST + ext4_map_blocks_es_recheck(handle, inode, map, + &orig_map, flags); +#endif + goto found; + } + /* * Try to see if we can get the block without requesting a new * file system block. @@ -527,20 +609,36 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, retval = ext4_ind_map_blocks(handle, inode, map, flags & EXT4_GET_BLOCKS_KEEP_SIZE); } + if (retval > 0) { + int ret; + unsigned long long status; + +#ifdef ES_AGGRESSIVE_TEST + if (retval != map->m_len) { + printk("ES len assertation failed for inode: %lu " + "retval %d != map->m_len %d " + "in %s (lookup)\n", inode->i_ino, retval, + map->m_len, __func__); + } +#endif + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && + ext4_find_delalloc_range(inode, map->m_lblk, + map->m_lblk + map->m_len - 1)) + status |= EXTENT_STATUS_DELAYED; + ret = ext4_es_insert_extent(inode, map->m_lblk, + map->m_len, map->m_pblk, status); + if (ret < 0) + retval = ret; + } if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) up_read((&EXT4_I(inode)->i_data_sem)); +found: if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret; - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { - /* delayed alloc may be allocated by fallocate and - * coverted to initialized by directIO. - * we need to handle delayed extent here. - */ - down_write((&EXT4_I(inode)->i_data_sem)); - goto delayed_mapped; - } - ret = check_block_validity(inode, map); + int ret = check_block_validity(inode, map); if (ret != 0) return ret; } @@ -560,16 +658,10 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, return retval; /* - * When we call get_blocks without the create flag, the - * BH_Unwritten flag could have gotten set if the blocks - * requested were part of a uninitialized extent. We need to - * clear this flag now that we are committed to convert all or - * part of the uninitialized extent to be an initialized - * extent. This is because we need to avoid the combination - * of BH_Unwritten and BH_Mapped flags being simultaneously - * set on the buffer_head. + * Here we clear m_flags because after allocating an new extent, + * it will be set again. */ - map->m_flags &= ~EXT4_MAP_UNWRITTEN; + map->m_flags &= ~EXT4_MAP_FLAGS; /* * New blocks allocate and/or writing to uninitialized extent @@ -615,20 +707,44 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) ext4_da_update_reserve_space(inode, retval, 1); } - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); - if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret; -delayed_mapped: - /* delayed allocation blocks has been allocated */ - ret = ext4_es_remove_extent(inode, map->m_lblk, - map->m_len); - if (ret < 0) - retval = ret; + if (retval > 0) { + int ret; + unsigned long long status; + +#ifdef ES_AGGRESSIVE_TEST + if (retval != map->m_len) { + printk("ES len assertation failed for inode: %lu " + "retval %d != map->m_len %d " + "in %s (allocation)\n", inode->i_ino, retval, + map->m_len, __func__); } +#endif + + /* + * If the extent has been zeroed out, we don't need to update + * extent status tree. + */ + if ((flags & EXT4_GET_BLOCKS_PRE_IO) && + ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + if (ext4_es_is_written(&es)) + goto has_zeroout; + } + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && + ext4_find_delalloc_range(inode, map->m_lblk, + map->m_lblk + map->m_len - 1)) + status |= EXTENT_STATUS_DELAYED; + ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status); + if (ret < 0) + retval = ret; } +has_zeroout: up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { int ret = check_block_validity(inode, map); @@ -660,7 +776,8 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, if (map.m_len > DIO_MAX_BLOCKS) map.m_len = DIO_MAX_BLOCKS; dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); - handle = ext4_journal_start(inode, dio_credits); + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + dio_credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); return ret; @@ -707,14 +824,16 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, /* ensure we send some value back into *errp */ *errp = 0; + if (create && err == 0) + err = -ENOSPC; /* should never happen */ if (err < 0) *errp = err; if (err <= 0) return NULL; bh = sb_getblk(inode->i_sb, map.m_pblk); - if (!bh) { - *errp = -EIO; + if (unlikely(!bh)) { + *errp = -ENOMEM; return NULL; } if (map.m_flags & EXT4_MAP_NEW) { @@ -808,11 +927,10 @@ int ext4_walk_page_buffers(handle_t *handle, * and the commit_write(). So doing the jbd2_journal_start at the start of * prepare_write() is the right place. * - * Also, this function can nest inside ext4_writepage() -> - * block_write_full_page(). In that case, we *know* that ext4_writepage() - * has generated enough buffer credits to do the whole page. So we won't - * block on the journal in that case, which is good, because the caller may - * be PF_MEMALLOC. + * Also, this function can nest inside ext4_writepage(). In that case, we + * *know* that ext4_writepage() has generated enough buffer credits to do the + * whole page. So we won't block on the journal in that case, which is good, + * because the caller may be PF_MEMALLOC. * * By accident, ext4 can be reentered when a transaction is open via * quota file writes. If we were to commit the transaction while thus @@ -878,32 +996,40 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, flags, pagep); if (ret < 0) - goto out; - if (ret == 1) { - ret = 0; - goto out; - } + return ret; + if (ret == 1) + return 0; } -retry: - handle = ext4_journal_start(inode, needed_blocks); + /* + * grab_cache_page_write_begin() can take a long time if the + * system is thrashing due to memory pressure, or if the page + * is being written back. So grab it first before we start + * the transaction handle. This also allows us to allocate + * the page (if needed) without using GFP_NOFS. + */ +retry_grab: + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + unlock_page(page); + +retry_journal: + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; + page_cache_release(page); + return PTR_ERR(handle); } - /* We cannot recurse into the filesystem as the transaction is already - * started */ - flags |= AOP_FLAG_NOFS; - - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) { + lock_page(page); + if (page->mapping != mapping) { + /* The page got truncated from under us */ + unlock_page(page); + page_cache_release(page); ext4_journal_stop(handle); - ret = -ENOMEM; - goto out; + goto retry_grab; } - - *pagep = page; + wait_on_page_writeback(page); if (ext4_should_dioread_nolock(inode)) ret = __block_write_begin(page, pos, len, ext4_get_block_write); @@ -918,7 +1044,6 @@ retry: if (ret) { unlock_page(page); - page_cache_release(page); /* * __block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need @@ -942,11 +1067,14 @@ retry: if (inode->i_nlink) ext4_orphan_del(NULL, inode); } - } - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; -out: + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry_journal; + page_cache_release(page); + return ret; + } + *pagep = page; return ret; } @@ -1178,6 +1306,55 @@ static int ext4_journalled_write_end(struct file *file, } /* + * Reserve a metadata for a single block located at lblock + */ +static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock) +{ + int retries = 0; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int md_needed; + ext4_lblk_t save_last_lblock; + int save_len; + + /* + * recalculate the amount of metadata blocks to reserve + * in order to allocate nrblocks + * worse case is one extent per block + */ +repeat: + spin_lock(&ei->i_block_reservation_lock); + /* + * ext4_calc_metadata_amount() has side effects, which we have + * to be prepared undo if we fail to claim space. + */ + save_len = ei->i_da_metadata_calc_len; + save_last_lblock = ei->i_da_metadata_calc_last_lblock; + md_needed = EXT4_NUM_B2C(sbi, + ext4_calc_metadata_amount(inode, lblock)); + trace_ext4_da_reserve_space(inode, md_needed); + + /* + * We do still charge estimated metadata to the sb though; + * we cannot afford to run out of free blocks. + */ + if (ext4_claim_free_clusters(sbi, md_needed, 0)) { + ei->i_da_metadata_calc_len = save_len; + ei->i_da_metadata_calc_last_lblock = save_last_lblock; + spin_unlock(&ei->i_block_reservation_lock); + if (ext4_should_retry_alloc(inode->i_sb, &retries)) { + cond_resched(); + goto repeat; + } + return -ENOSPC; + } + ei->i_reserved_meta_blocks += md_needed; + spin_unlock(&ei->i_block_reservation_lock); + + return 0; /* success */ +} + +/* * Reserve a single cluster located at lblock */ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) @@ -1225,7 +1402,7 @@ repeat: ei->i_da_metadata_calc_last_lblock = save_last_lblock; spin_unlock(&ei->i_block_reservation_lock); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { - yield(); + cond_resched(); goto repeat; } dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); @@ -1256,7 +1433,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free) * function is called from invalidate page, it's * harmless to return without any action. */ - ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " + ext4_warning(inode->i_sb, "ext4_da_release_space: " "ino %lu, to_free %d with only %d reserved " "data blocks", inode->i_ino, to_free, ei->i_reserved_data_blocks); @@ -1357,7 +1534,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, loff_t size = i_size_read(inode); unsigned int len, block_start; struct buffer_head *bh, *page_bufs = NULL; - int journal_data = ext4_should_journal_data(inode); sector_t pblock = 0, cur_logical = 0; struct ext4_io_submit io_submit; @@ -1378,7 +1554,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { - int commit_write = 0, skip_page = 0; + int skip_page = 0; struct page *page = pvec.pages[i]; index = page->index; @@ -1400,27 +1576,9 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); - /* - * If the page does not have buffers (for - * whatever reason), try to create them using - * __block_write_begin. If this fails, - * skip the page and move on. - */ - if (!page_has_buffers(page)) { - if (__block_write_begin(page, 0, len, - noalloc_get_block_write)) { - skip_page: - unlock_page(page); - continue; - } - commit_write = 1; - } - bh = page_bufs = page_buffers(page); block_start = 0; do { - if (!bh) - goto skip_page; if (map && (cur_logical >= map->m_lblk) && (cur_logical <= (map->m_lblk + (map->m_len - 1)))) { @@ -1448,33 +1606,14 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, pblock++; } while (bh != page_bufs); - if (skip_page) - goto skip_page; - - if (commit_write) - /* mark the buffer_heads as dirty & uptodate */ - block_commit_write(page, 0, len); + if (skip_page) { + unlock_page(page); + continue; + } clear_page_dirty_for_io(page); - /* - * Delalloc doesn't support data journalling, - * but eventually maybe we'll lift this - * restriction. - */ - if (unlikely(journal_data && PageChecked(page))) - err = __ext4_journalled_writepage(page, len); - else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) - err = ext4_bio_write_page(&io_submit, page, - len, mpd->wbc); - else if (buffer_uninit(page_bufs)) { - ext4_set_bh_endio(page_bufs, inode); - err = block_write_full_page_endio(page, - noalloc_get_block_write, - mpd->wbc, ext4_end_io_buffer_write); - } else - err = block_write_full_page(page, - noalloc_get_block_write, mpd->wbc); - + err = ext4_bio_write_page(&io_submit, page, len, + mpd->wbc); if (!err) mpd->pages_written++; /* @@ -1640,7 +1779,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) (unsigned long long) next, mpd->b_size >> mpd->inode->i_blkbits, err); ext4_msg(sb, KERN_CRIT, - "This should not happen!! Data will be lost\n"); + "This should not happen!! Data will be lost"); if (err == -ENOSPC) ext4_print_free_blocks(mpd->inode); } @@ -1690,16 +1829,16 @@ submit_io: * * @mpd->lbh - extent of blocks * @logical - logical number of the block in the file - * @bh - bh of the block (used to access block's state) + * @b_state - b_state of the buffer head added * * the function is used to collect contig. blocks in same state */ -static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, - sector_t logical, size_t b_size, +static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical, unsigned long b_state) { sector_t next; - int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; + int blkbits = mpd->inode->i_blkbits; + int nrblocks = mpd->b_size >> blkbits; /* * XXX Don't go larger than mballoc is willing to allocate @@ -1707,11 +1846,11 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, * mpage_da_submit_io() into this function and then call * ext4_map_blocks() multiple times in a loop */ - if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize) + if (nrblocks >= (8*1024*1024 >> blkbits)) goto flush_it; - /* check if thereserved journal credits might overflow */ - if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) { + /* check if the reserved journal credits might overflow */ + if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) { if (nrblocks >= EXT4_MAX_TRANS_DATA) { /* * With non-extent format we are limited by the journal @@ -1720,16 +1859,6 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, * nrblocks. So limit nrblocks. */ goto flush_it; - } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > - EXT4_MAX_TRANS_DATA) { - /* - * Adding the new buffer_head would make it cross the - * allowed limit for which we have journal credit - * reserved. So limit the new bh->b_size - */ - b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << - mpd->inode->i_blkbits; - /* we will do mpage_da_submit_io in the next loop */ } } /* @@ -1737,7 +1866,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, */ if (mpd->b_size == 0) { mpd->b_blocknr = logical; - mpd->b_size = b_size; + mpd->b_size = 1 << blkbits; mpd->b_state = b_state & BH_FLAGS; return; } @@ -1747,7 +1876,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, * Can we merge the block to our big extent? */ if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) { - mpd->b_size += b_size; + mpd->b_size += 1 << blkbits; return; } @@ -1775,8 +1904,14 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, struct ext4_map_blocks *map, struct buffer_head *bh) { + struct extent_status es; int retval; sector_t invalid_block = ~((sector_t) 0xffff); +#ifdef ES_AGGRESSIVE_TEST + struct ext4_map_blocks orig_map; + + memcpy(&orig_map, map, sizeof(*map)); +#endif if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) invalid_block = ~0; @@ -1785,6 +1920,45 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u," "logical block %lu\n", inode->i_ino, map->m_len, (unsigned long) map->m_lblk); + + /* Lookup extent status tree firstly */ + if (ext4_es_lookup_extent(inode, iblock, &es)) { + + if (ext4_es_is_hole(&es)) { + retval = 0; + down_read((&EXT4_I(inode)->i_data_sem)); + goto add_delayed; + } + + /* + * Delayed extent could be allocated by fallocate. + * So we need to check it. + */ + if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) { + map_bh(bh, inode->i_sb, invalid_block); + set_buffer_new(bh); + set_buffer_delay(bh); + return 0; + } + + map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk; + retval = es.es_len - (iblock - es.es_lblk); + if (retval > map->m_len) + retval = map->m_len; + map->m_len = retval; + if (ext4_es_is_written(&es)) + map->m_flags |= EXT4_MAP_MAPPED; + else if (ext4_es_is_unwritten(&es)) + map->m_flags |= EXT4_MAP_UNWRITTEN; + else + BUG_ON(1); + +#ifdef ES_AGGRESSIVE_TEST + ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); +#endif + return retval; + } + /* * Try to see if we can get the block without requesting a new * file system block. @@ -1803,27 +1977,46 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, map->m_flags |= EXT4_MAP_FROM_CLUSTER; retval = 0; } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - retval = ext4_ext_map_blocks(NULL, inode, map, 0); + retval = ext4_ext_map_blocks(NULL, inode, map, + EXT4_GET_BLOCKS_NO_PUT_HOLE); else - retval = ext4_ind_map_blocks(NULL, inode, map, 0); + retval = ext4_ind_map_blocks(NULL, inode, map, + EXT4_GET_BLOCKS_NO_PUT_HOLE); +add_delayed: if (retval == 0) { + int ret; /* * XXX: __block_prepare_write() unmaps passed block, * is it OK? */ - /* If the block was allocated from previously allocated cluster, - * then we dont need to reserve it again. */ + /* + * If the block was allocated from previously allocated cluster, + * then we don't need to reserve it again. However we still need + * to reserve metadata for every block we're going to write. + */ if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) { - retval = ext4_da_reserve_space(inode, iblock); - if (retval) + ret = ext4_da_reserve_space(inode, iblock); + if (ret) { /* not enough space to reserve */ + retval = ret; goto out_unlock; + } + } else { + ret = ext4_da_reserve_metadata(inode, iblock); + if (ret) { + /* not enough space to reserve */ + retval = ret; + goto out_unlock; + } } - retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len); - if (retval) + ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + ~0, EXTENT_STATUS_DELAYED); + if (ret) { + retval = ret; goto out_unlock; + } /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served * and it should not appear on the bh->b_state. @@ -1833,6 +2026,25 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, map_bh(bh, inode->i_sb, invalid_block); set_buffer_new(bh); set_buffer_delay(bh); + } else if (retval > 0) { + int ret; + unsigned long long status; + +#ifdef ES_AGGRESSIVE_TEST + if (retval != map->m_len) { + printk("ES len assertation failed for inode: %lu " + "retval %d != map->m_len %d " + "in %s (lookup)\n", inode->i_ino, retval, + map->m_len, __func__); + } +#endif + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status); + if (ret != 0) + retval = ret; } out_unlock: @@ -1890,27 +2102,6 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, return 0; } -/* - * This function is used as a standard get_block_t calback function - * when there is no desire to allocate any blocks. It is used as a - * callback function for block_write_begin() and block_write_full_page(). - * These functions should only try to map a single block at a time. - * - * Since this function doesn't do block allocations even if the caller - * requests it by passing in create=1, it is critically important that - * any caller checks to make sure that any buffer heads are returned - * by this function are either all already mapped or marked for - * delayed allocation before calling block_write_full_page(). Otherwise, - * b_blocknr could be left unitialized, and the page write functions will - * be taken by surprise. - */ -static int noalloc_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); - return _ext4_get_block(inode, iblock, bh_result, 0); -} - static int bget_one(handle_t *handle, struct buffer_head *bh) { get_bh(bh); @@ -1955,7 +2146,8 @@ static int __ext4_journalled_writepage(struct page *page, * references to buffers so we are safe */ unlock_page(page); - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, + ext4_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; @@ -2035,11 +2227,12 @@ out: static int ext4_writepage(struct page *page, struct writeback_control *wbc) { - int ret = 0, commit_write = 0; + int ret = 0; loff_t size; unsigned int len; struct buffer_head *page_bufs = NULL; struct inode *inode = page->mapping->host; + struct ext4_io_submit io_submit; trace_ext4_writepage(page); size = i_size_read(inode); @@ -2048,39 +2241,29 @@ static int ext4_writepage(struct page *page, else len = PAGE_CACHE_SIZE; + page_bufs = page_buffers(page); /* - * If the page does not have buffers (for whatever reason), - * try to create them using __block_write_begin. If this - * fails, redirty the page and move on. + * We cannot do block allocation or other extent handling in this + * function. If there are buffers needing that, we have to redirty + * the page. But we may reach here when we do a journal commit via + * journal_submit_inode_data_buffers() and in that case we must write + * allocated buffers to achieve data=ordered mode guarantees. */ - if (!page_has_buffers(page)) { - if (__block_write_begin(page, 0, len, - noalloc_get_block_write)) { - redirty_page: - redirty_page_for_writepage(wbc, page); + if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_delay_or_unwritten)) { + redirty_page_for_writepage(wbc, page); + if (current->flags & PF_MEMALLOC) { + /* + * For memory cleaning there's no point in writing only + * some buffers. So just bail out. Warn if we came here + * from direct reclaim. + */ + WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) + == PF_MEMALLOC); unlock_page(page); return 0; } - commit_write = 1; } - page_bufs = page_buffers(page); - if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, - ext4_bh_delay_or_unwritten)) { - /* - * We don't want to do block allocation, so redirty - * the page and return. We may reach here when we do - * a journal commit via journal_submit_inode_data_buffers. - * We can also reach here via shrink_page_list but it - * should never be for direct reclaim so warn if that - * happens - */ - WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == - PF_MEMALLOC); - goto redirty_page; - } - if (commit_write) - /* now mark the buffer_heads as dirty and uptodate */ - block_commit_write(page, 0, len); if (PageChecked(page) && ext4_should_journal_data(inode)) /* @@ -2089,14 +2272,9 @@ static int ext4_writepage(struct page *page, */ return __ext4_journalled_writepage(page, len); - if (buffer_uninit(page_bufs)) { - ext4_set_bh_endio(page_bufs, inode); - ret = block_write_full_page_endio(page, noalloc_get_block_write, - wbc, ext4_end_io_buffer_write); - } else - ret = block_write_full_page(page, noalloc_get_block_write, - wbc); - + memset(&io_submit, 0, sizeof(io_submit)); + ret = ext4_bio_write_page(&io_submit, page, len, wbc); + ext4_io_submit(&io_submit); return ret; } @@ -2228,51 +2406,38 @@ static int write_cache_pages_da(handle_t *handle, logical = (sector_t) page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - if (!page_has_buffers(page)) { - mpage_add_bh_to_extent(mpd, logical, - PAGE_CACHE_SIZE, - (1 << BH_Dirty) | (1 << BH_Uptodate)); - if (mpd->io_done) - goto ret_extent_tail; - } else { + /* Add all dirty buffers to mpd */ + head = page_buffers(page); + bh = head; + do { + BUG_ON(buffer_locked(bh)); /* - * Page with regular buffer heads, - * just add all dirty ones + * We need to try to allocate unmapped blocks + * in the same page. Otherwise we won't make + * progress with the page in ext4_writepage */ - head = page_buffers(page); - bh = head; - do { - BUG_ON(buffer_locked(bh)); + if (ext4_bh_delay_or_unwritten(NULL, bh)) { + mpage_add_bh_to_extent(mpd, logical, + bh->b_state); + if (mpd->io_done) + goto ret_extent_tail; + } else if (buffer_dirty(bh) && + buffer_mapped(bh)) { /* - * We need to try to allocate - * unmapped blocks in the same page. - * Otherwise we won't make progress - * with the page in ext4_writepage + * mapped dirty buffer. We need to + * update the b_state because we look + * at b_state in mpage_da_map_blocks. + * We don't update b_size because if we + * find an unmapped buffer_head later + * we need to use the b_state flag of + * that buffer_head. */ - if (ext4_bh_delay_or_unwritten(NULL, bh)) { - mpage_add_bh_to_extent(mpd, logical, - bh->b_size, - bh->b_state); - if (mpd->io_done) - goto ret_extent_tail; - } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { - /* - * mapped dirty buffer. We need - * to update the b_state - * because we look at b_state - * in mpage_da_map_blocks. We - * don't update b_size because - * if we find an unmapped - * buffer_head later we need to - * use the b_state flag of that - * buffer_head. - */ - if (mpd->b_size == 0) - mpd->b_state = bh->b_state & BH_FLAGS; - } - logical++; - } while ((bh = bh->b_this_page) != head); - } + if (mpd->b_size == 0) + mpd->b_state = + bh->b_state & BH_FLAGS; + } + logical++; + } while ((bh = bh->b_this_page) != head); if (nr_to_write > 0) { nr_to_write--; @@ -2413,7 +2578,8 @@ retry: needed_blocks = ext4_da_writepages_trans_blocks(inode); /* start a new transaction*/ - handle = ext4_journal_start(inode, needed_blocks); + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, + needed_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " @@ -2512,12 +2678,8 @@ static int ext4_nonda_switch(struct super_block *sb) /* * Start pushing delalloc when 1/2 of free blocks are dirty. */ - if (dirty_blocks && (free_blocks < 2 * dirty_blocks) && - !writeback_in_progress(sb->s_bdi) && - down_read_trylock(&sb->s_umount)) { - writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); - up_read(&sb->s_umount); - } + if (dirty_blocks && (free_blocks < 2 * dirty_blocks)) + try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); if (2 * free_blocks < 3 * dirty_blocks || free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) { @@ -2555,42 +2717,52 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, pos, len, flags, pagep, fsdata); if (ret < 0) - goto out; - if (ret == 1) { - ret = 0; - goto out; - } + return ret; + if (ret == 1) + return 0; } -retry: + /* + * grab_cache_page_write_begin() can take a long time if the + * system is thrashing due to memory pressure, or if the page + * is being written back. So grab it first before we start + * the transaction handle. This also allows us to allocate + * the page (if needed) without using GFP_NOFS. + */ +retry_grab: + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + unlock_page(page); + /* * With delayed allocation, we don't log the i_disksize update * if there is delayed block allocation. But we still need * to journalling the i_disksize update if writes to the end * of file which has an already mapped buffer. */ - handle = ext4_journal_start(inode, 1); +retry_journal: + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1); if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; + page_cache_release(page); + return PTR_ERR(handle); } - /* We cannot recurse into the filesystem as the transaction is already - * started */ - flags |= AOP_FLAG_NOFS; - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) { + lock_page(page); + if (page->mapping != mapping) { + /* The page got truncated from under us */ + unlock_page(page); + page_cache_release(page); ext4_journal_stop(handle); - ret = -ENOMEM; - goto out; + goto retry_grab; } - *pagep = page; + /* In case writeback began while the page was unlocked */ + wait_on_page_writeback(page); ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); if (ret < 0) { unlock_page(page); ext4_journal_stop(handle); - page_cache_release(page); /* * block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need @@ -2598,11 +2770,16 @@ retry: */ if (pos + len > inode->i_size) ext4_truncate_failed_write(inode); + + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry_journal; + + page_cache_release(page); + return ret; } - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; -out: + *pagep = page; return ret; } @@ -2858,36 +3035,10 @@ ext4_readpages(struct file *file, struct address_space *mapping, return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); } -static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) -{ - struct buffer_head *head, *bh; - unsigned int curr_off = 0; - - if (!page_has_buffers(page)) - return; - head = bh = page_buffers(page); - do { - if (offset <= curr_off && test_clear_buffer_uninit(bh) - && bh->b_private) { - ext4_free_io_end(bh->b_private); - bh->b_private = NULL; - bh->b_end_io = NULL; - } - curr_off = curr_off + bh->b_size; - bh = bh->b_this_page; - } while (bh != head); -} - static void ext4_invalidatepage(struct page *page, unsigned long offset) { trace_ext4_invalidatepage(page, offset); - /* - * free any io_end structure allocated for buffers to be discarded - */ - if (ext4_should_dioread_nolock(page->mapping->host)) - ext4_invalidatepage_free_endio(page, offset); - /* No journalling happens on data buffers when this function is used */ WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); @@ -2923,8 +3074,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait) trace_ext4_releasepage(page); - WARN_ON(PageChecked(page)); - if (!page_has_buffers(page)) + /* Page has dirty journalled data -> cannot release */ + if (PageChecked(page)) return 0; if (journal) return jbd2_journal_try_to_free_buffers(journal, page, wait); @@ -2959,7 +3110,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ssize_t size, void *private, int ret, bool is_async) { - struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(iocb->ki_filp); ext4_io_end_t *io_end = iocb->private; /* if not async direct IO or dio with 0 bytes write, just return */ @@ -2977,9 +3128,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { ext4_free_io_end(io_end); out: + inode_dio_done(inode); if (is_async) aio_complete(iocb, ret, 0); - inode_dio_done(inode); return; } @@ -2993,65 +3144,6 @@ out: ext4_add_complete_io(io_end); } -static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) -{ - ext4_io_end_t *io_end = bh->b_private; - struct inode *inode; - - if (!test_clear_buffer_uninit(bh) || !io_end) - goto out; - - if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { - ext4_msg(io_end->inode->i_sb, KERN_INFO, - "sb umounted, discard end_io request for inode %lu", - io_end->inode->i_ino); - ext4_free_io_end(io_end); - goto out; - } - - /* - * It may be over-defensive here to check EXT4_IO_END_UNWRITTEN now, - * but being more careful is always safe for the future change. - */ - inode = io_end->inode; - ext4_set_io_unwritten_flag(inode, io_end); - ext4_add_complete_io(io_end); -out: - bh->b_private = NULL; - bh->b_end_io = NULL; - clear_buffer_uninit(bh); - end_buffer_async_write(bh, uptodate); -} - -static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode) -{ - ext4_io_end_t *io_end; - struct page *page = bh->b_page; - loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT; - size_t size = bh->b_size; - -retry: - io_end = ext4_init_io_end(inode, GFP_ATOMIC); - if (!io_end) { - pr_warn_ratelimited("%s: allocation fail\n", __func__); - schedule(); - goto retry; - } - io_end->offset = offset; - io_end->size = size; - /* - * We need to hold a reference to the page to make sure it - * doesn't get evicted before ext4_end_io_work() has a chance - * to convert the extent from written to unwritten. - */ - io_end->page = page; - get_page(io_end->page); - - bh->b_private = io_end; - bh->b_end_io = ext4_end_io_buffer_write; - return 0; -} - /* * For ext4 extent files, ext4 will do direct-io write to holes, * preallocated extents, and those write extend the file, no need to @@ -3553,20 +3645,20 @@ int ext4_can_truncate(struct inode *inode) int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - /* TODO: Add support for non extent hole punching */ - return -EOPNOTSUPP; - } + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return ext4_ind_punch_hole(file, offset, length); if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) { /* TODO: Add support for bigalloc file systems */ return -EOPNOTSUPP; } + trace_ext4_punch_hole(inode, offset, length); + return ext4_ext_punch_hole(file, offset, length); } @@ -3660,11 +3752,8 @@ static int __ext4_get_inode_loc(struct inode *inode, iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); bh = sb_getblk(sb, block); - if (!bh) { - EXT4_ERROR_INODE_BLOCK(inode, block, - "unable to read itable block"); - return -EIO; - } + if (unlikely(!bh)) + return -ENOMEM; if (!buffer_uptodate(bh)) { lock_buffer(bh); @@ -3696,7 +3785,7 @@ static int __ext4_get_inode_loc(struct inode *inode, /* Is the inode bitmap in cache? */ bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); - if (!bitmap_bh) + if (unlikely(!bitmap_bh)) goto make_io; /* @@ -4404,8 +4493,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ - handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ - EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3); + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, + (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; @@ -4440,7 +4530,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) (attr->ia_size < inode->i_size)) { handle_t *handle; - handle = ext4_journal_start(inode, 3); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; @@ -4460,7 +4550,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_size); if (error) { /* Do as much error cleanup as possible */ - handle = ext4_journal_start(inode, 3); + handle = ext4_journal_start(inode, + EXT4_HT_INODE, 3); if (IS_ERR(handle)) { ext4_orphan_del(NULL, inode); goto err_out; @@ -4801,7 +4892,7 @@ void ext4_dirty_inode(struct inode *inode, int flags) { handle_t *handle; - handle = ext4_journal_start(inode, 2); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) goto out; @@ -4902,7 +4993,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) /* Finally we can mark the inode as dirty. */ - handle = ext4_journal_start(inode, 1); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -4926,7 +5017,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) unsigned long len; int ret; struct file *file = vma->vm_file; - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; handle_t *handle; get_block_t *get_block; @@ -4968,7 +5059,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 0, len, NULL, ext4_bh_unmapped)) { /* Wait so that we don't change page under IO */ - wait_on_page_writeback(page); + wait_for_stable_page(page); ret = VM_FAULT_LOCKED; goto out; } @@ -4980,7 +5071,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) else get_block = ext4_get_block; retry_alloc: - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, + ext4_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = VM_FAULT_SIGBUS; goto out; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 5747f52f7c72..721f4d33e148 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -22,7 +22,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; struct ext4_inode_info *ei = EXT4_I(inode); unsigned int flags; @@ -104,7 +104,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } else if (oldflags & EXT4_EOFBLOCKS_FL) ext4_truncate(inode); - handle = ext4_journal_start(inode, 1); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto flags_out; @@ -173,7 +173,7 @@ flags_out: } mutex_lock(&inode->i_mutex); - handle = ext4_journal_start(inode, 1); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto unlock_out; @@ -313,6 +313,9 @@ mext_out: if (err == 0) err = err2; mnt_drop_write_file(filp); + if (!err && ext4_has_group_desc_csum(sb) && + test_opt(sb, INIT_INODE_TABLE)) + err = ext4_register_li_request(sb, input.group); group_add_out: ext4_resize_end(sb); return err; @@ -358,6 +361,7 @@ group_add_out: ext4_fsblk_t n_blocks_count; struct super_block *sb = inode->i_sb; int err = 0, err2 = 0; + ext4_group_t o_group = EXT4_SB(sb)->s_groups_count; if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { @@ -388,6 +392,11 @@ group_add_out: if (err == 0) err = err2; mnt_drop_write_file(filp); + if (!err && (o_group > EXT4_SB(sb)->s_groups_count) && + ext4_has_group_desc_csum(sb) && + test_opt(sb, INIT_INODE_TABLE)) + err = ext4_register_li_request(sb, o_group); + resizefs_out: ext4_resize_end(sb); return err; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 1bf6fe785c4f..ee6614bdb639 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -23,11 +23,18 @@ #include "ext4_jbd2.h" #include "mballoc.h" -#include <linux/debugfs.h> #include <linux/log2.h> +#include <linux/module.h> #include <linux/slab.h> #include <trace/events/ext4.h> +#ifdef CONFIG_EXT4_DEBUG +ushort ext4_mballoc_debug __read_mostly; + +module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644); +MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc"); +#endif + /* * MUSTDO: * - test ext4_ext_search_left() and ext4_ext_search_right() @@ -1884,15 +1891,19 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, case 0: BUG_ON(ac->ac_2order == 0); - if (grp->bb_largest_free_order < ac->ac_2order) - return 0; - /* Avoid using the first bg of a flexgroup for data files */ if ((ac->ac_flags & EXT4_MB_HINT_DATA) && (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && ((group % flex_size) == 0)) return 0; + if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) || + (free / fragments) >= ac->ac_g_ex.fe_len) + return 1; + + if (grp->bb_largest_free_order < ac->ac_2order) + return 0; + return 1; case 1: if ((free / fragments) >= ac->ac_g_ex.fe_len) @@ -2007,7 +2018,7 @@ repeat: } ac->ac_groups_scanned++; - if (cr == 0) + if (cr == 0 && ac->ac_2order < sb->s_blocksize_bits+2) ext4_mb_simple_scan_group(ac, &e4b); else if (cr == 1 && sbi->s_stripe && !(ac->ac_g_ex.fe_len % sbi->s_stripe)) @@ -2656,40 +2667,6 @@ static void ext4_free_data_callback(struct super_block *sb, mb_debug(1, "freed %u blocks in %u structures\n", count, count2); } -#ifdef CONFIG_EXT4_DEBUG -u8 mb_enable_debug __read_mostly; - -static struct dentry *debugfs_dir; -static struct dentry *debugfs_debug; - -static void __init ext4_create_debugfs_entry(void) -{ - debugfs_dir = debugfs_create_dir("ext4", NULL); - if (debugfs_dir) - debugfs_debug = debugfs_create_u8("mballoc-debug", - S_IRUGO | S_IWUSR, - debugfs_dir, - &mb_enable_debug); -} - -static void ext4_remove_debugfs_entry(void) -{ - debugfs_remove(debugfs_debug); - debugfs_remove(debugfs_dir); -} - -#else - -static void __init ext4_create_debugfs_entry(void) -{ -} - -static void ext4_remove_debugfs_entry(void) -{ -} - -#endif - int __init ext4_init_mballoc(void) { ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, @@ -2711,7 +2688,6 @@ int __init ext4_init_mballoc(void) kmem_cache_destroy(ext4_ac_cachep); return -ENOMEM; } - ext4_create_debugfs_entry(); return 0; } @@ -2726,7 +2702,6 @@ void ext4_exit_mballoc(void) kmem_cache_destroy(ext4_ac_cachep); kmem_cache_destroy(ext4_free_data_cachep); ext4_groupinfo_destroy_slabs(); - ext4_remove_debugfs_entry(); } @@ -2829,8 +2804,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, ac->ac_b_ex.fe_group); - atomic_sub(ac->ac_b_ex.fe_len, - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_sub(ac->ac_b_ex.fe_len, + &sbi->s_flex_groups[flex_group].free_clusters); } err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); @@ -3444,7 +3419,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) win = offs; ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - - EXT4_B2C(sbi, win); + EXT4_NUM_B2C(sbi, win); BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); } @@ -3717,11 +3692,7 @@ repeat: if (free < needed && busy) { busy = 0; ext4_unlock_group(sb, group); - /* - * Yield the CPU here so that we don't get soft lockup - * in non preempt case. - */ - yield(); + cond_resched(); goto repeat; } @@ -3872,7 +3843,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) struct super_block *sb = ac->ac_sb; ext4_group_t ngroups, i; - if (!mb_enable_debug || + if (!ext4_mballoc_debug || (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) return; @@ -4005,8 +3976,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, len = ar->len; /* just a dirty hack to filter too big requests */ - if (len >= EXT4_CLUSTERS_PER_GROUP(sb) - 10) - len = EXT4_CLUSTERS_PER_GROUP(sb) - 10; + if (len >= EXT4_CLUSTERS_PER_GROUP(sb)) + len = EXT4_CLUSTERS_PER_GROUP(sb); /* start searching from the goal */ goal = ar->goal; @@ -4136,7 +4107,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) /* The max size of hash table is PREALLOC_TB_SIZE */ order = PREALLOC_TB_SIZE - 1; /* Add the prealloc space to lg */ - rcu_read_lock(); + spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], pa_inode_list) { spin_lock(&tmp_pa->pa_lock); @@ -4160,12 +4131,12 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) if (!added) list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list[order]); - rcu_read_unlock(); + spin_unlock(&lg->lg_prealloc_lock); /* Now trim the list to be not more than 8 elements */ if (lg_prealloc_count > 8) { ext4_mb_discard_lg_preallocations(sb, lg, - order, lg_prealloc_count); + order, lg_prealloc_count); return; } return ; @@ -4271,7 +4242,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { /* let others to free the space */ - yield(); + cond_resched(); ar->len = ar->len >> 1; } if (!ar->len) { @@ -4489,7 +4460,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bitmap_bh = NULL; struct super_block *sb = inode->i_sb; struct ext4_group_desc *gdp; - unsigned long freed = 0; unsigned int overflow; ext4_grpblk_t bit; struct buffer_head *gd_bh; @@ -4590,7 +4560,7 @@ do_more: EXT4_BLOCKS_PER_GROUP(sb); count -= overflow; } - count_clusters = EXT4_B2C(sbi, count); + count_clusters = EXT4_NUM_B2C(sbi, count); bitmap_bh = ext4_read_block_bitmap(sb, block_group); if (!bitmap_bh) { err = -EIO; @@ -4691,14 +4661,12 @@ do_more: if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(count_clusters, - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(count_clusters, + &sbi->s_flex_groups[flex_group].free_clusters); } ext4_mb_unload_buddy(&e4b); - freed += count; - if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); @@ -4832,12 +4800,12 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_group_desc_csum_set(sb, block_group, desc); ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeclusters_counter, - EXT4_B2C(sbi, blocks_freed)); + EXT4_NUM_B2C(sbi, blocks_freed)); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(EXT4_B2C(sbi, blocks_freed), - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed), + &sbi->s_flex_groups[flex_group].free_clusters); } ext4_mb_unload_buddy(&e4b); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 3ccd889ba953..08481ee84cd5 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -37,11 +37,11 @@ /* */ #ifdef CONFIG_EXT4_DEBUG -extern u8 mb_enable_debug; +extern ushort ext4_mballoc_debug; #define mb_debug(n, fmt, a...) \ do { \ - if ((n) <= mb_enable_debug) { \ + if ((n) <= ext4_mballoc_debug) { \ printk(KERN_DEBUG "(%s, %d): %s: ", \ __FILE__, __LINE__, __func__); \ printk(fmt, ## a); \ diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index db8226d595fa..480acf4a085f 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -456,11 +456,14 @@ int ext4_ext_migrate(struct inode *inode) */ return retval; - handle = ext4_journal_start(inode, - EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) - + 1); + /* + * Worst case we can touch the allocation bitmaps, a bgd + * block, and a block to link in the orphan list. We do need + * need to worry about credits for modifying the quota inode. + */ + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, + 4 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { retval = PTR_ERR(handle); return retval; @@ -507,7 +510,7 @@ int ext4_ext_migrate(struct inode *inode) ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE); up_read((&EXT4_I(inode)->i_data_sem)); - handle = ext4_journal_start(inode, 1); + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); if (IS_ERR(handle)) { /* * It is impossible to update on-disk structures without diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index fe7c63f4717e..f9b551561d2c 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -80,6 +80,8 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, * is not blocked in the elevator. */ if (!*bh) *bh = sb_getblk(sb, mmp_block); + if (!*bh) + return -ENOMEM; if (*bh) { get_bh(*bh); lock_buffer(*bh); @@ -91,7 +93,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, *bh = NULL; } } - if (!*bh) { + if (unlikely(!*bh)) { ext4_warning(sb, "Error while reading MMP block %llu", mmp_block); return -EIO; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index d9cc5ee42f53..33e1c086858b 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -32,16 +32,18 @@ */ static inline int get_ext_path(struct inode *inode, ext4_lblk_t lblock, - struct ext4_ext_path **path) + struct ext4_ext_path **orig_path) { int ret = 0; + struct ext4_ext_path *path; - *path = ext4_ext_find_extent(inode, lblock, *path); - if (IS_ERR(*path)) { - ret = PTR_ERR(*path); - *path = NULL; - } else if ((*path)[ext_depth(inode)].p_ext == NULL) + path = ext4_ext_find_extent(inode, lblock, *orig_path); + if (IS_ERR(path)) + ret = PTR_ERR(path); + else if (path[ext_depth(inode)].p_ext == NULL) ret = -ENODATA; + else + *orig_path = path; return ret; } @@ -611,24 +613,25 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, { struct ext4_ext_path *path = NULL; struct ext4_extent *ext; + int ret = 0; ext4_lblk_t last = from + count; while (from < last) { *err = get_ext_path(inode, from, &path); if (*err) - return 0; + goto out; ext = path[ext_depth(inode)].p_ext; - if (!ext) { - ext4_ext_drop_refs(path); - return 0; - } - if (uninit != ext4_ext_is_uninitialized(ext)) { - ext4_ext_drop_refs(path); - return 0; - } + if (uninit != ext4_ext_is_uninitialized(ext)) + goto out; from += ext4_ext_get_actual_len(ext); ext4_ext_drop_refs(path); } - return 1; + ret = 1; +out: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + return ret; } /** @@ -666,6 +669,14 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, int replaced_count = 0; int dext_alen; + *err = ext4_es_remove_extent(orig_inode, from, count); + if (*err) + goto out; + + *err = ext4_es_remove_extent(donor_inode, from, count); + if (*err) + goto out; + /* Get the original extent for the block "orig_off" */ *err = get_ext_path(orig_inode, orig_off, &orig_path); if (*err) @@ -681,6 +692,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, depth = ext_depth(donor_inode); dext = donor_path[depth].p_ext; + if (unlikely(!dext)) + goto missing_donor_extent; tmp_dext = *dext; *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, @@ -691,7 +704,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, /* Loop for the donor extents */ while (1) { /* The extent for donor must be found. */ - if (!dext) { + if (unlikely(!dext)) { + missing_donor_extent: EXT4_ERROR_INODE(donor_inode, "The extent for donor must be found"); *err = -EIO; @@ -761,9 +775,6 @@ out: kfree(donor_path); } - ext4_ext_invalidate_cache(orig_inode); - ext4_ext_invalidate_cache(donor_inode); - return replaced_count; } @@ -900,7 +911,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, pgoff_t orig_page_offset, int data_offset_in_page, int block_len_in_page, int uninit, int *err) { - struct inode *orig_inode = o_filp->f_dentry->d_inode; + struct inode *orig_inode = file_inode(o_filp); struct page *pagep[2] = {NULL, NULL}; handle_t *handle; ext4_lblk_t orig_blk_offset; @@ -920,7 +931,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, again: *err = 0; jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; - handle = ext4_journal_start(orig_inode, jblocks); + handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks); if (IS_ERR(handle)) { *err = PTR_ERR(handle); return 0; @@ -1279,8 +1290,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_start, __u64 donor_start, __u64 len, __u64 *moved_len) { - struct inode *orig_inode = o_filp->f_dentry->d_inode; - struct inode *donor_inode = d_filp->f_dentry->d_inode; + struct inode *orig_inode = file_inode(o_filp); + struct inode *donor_inode = file_inode(d_filp); struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL; struct ext4_extent *ext_prev, *ext_cur, *ext_dummy; ext4_lblk_t block_start = orig_start; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f9ed946a448e..3825d6aa8336 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -47,38 +47,111 @@ #define NAMEI_RA_CHUNKS 2 #define NAMEI_RA_BLOCKS 4 #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) -#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) static struct buffer_head *ext4_append(handle_t *handle, struct inode *inode, - ext4_lblk_t *block, int *err) + ext4_lblk_t *block) { struct buffer_head *bh; + int err = 0; if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && ((inode->i_size >> 10) >= - EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) { - *err = -ENOSPC; - return NULL; - } + EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) + return ERR_PTR(-ENOSPC); *block = inode->i_size >> inode->i_sb->s_blocksize_bits; - bh = ext4_bread(handle, inode, *block, 1, err); - if (bh) { - inode->i_size += inode->i_sb->s_blocksize; - EXT4_I(inode)->i_disksize = inode->i_size; - *err = ext4_journal_get_write_access(handle, bh); - if (*err) { + bh = ext4_bread(handle, inode, *block, 1, &err); + if (!bh) + return ERR_PTR(err); + inode->i_size += inode->i_sb->s_blocksize; + EXT4_I(inode)->i_disksize = inode->i_size; + err = ext4_journal_get_write_access(handle, bh); + if (err) { + brelse(bh); + ext4_std_error(inode->i_sb, err); + return ERR_PTR(err); + } + return bh; +} + +static int ext4_dx_csum_verify(struct inode *inode, + struct ext4_dir_entry *dirent); + +typedef enum { + EITHER, INDEX, DIRENT +} dirblock_type_t; + +#define ext4_read_dirblock(inode, block, type) \ + __ext4_read_dirblock((inode), (block), (type), __LINE__) + +static struct buffer_head *__ext4_read_dirblock(struct inode *inode, + ext4_lblk_t block, + dirblock_type_t type, + unsigned int line) +{ + struct buffer_head *bh; + struct ext4_dir_entry *dirent; + int err = 0, is_dx_block = 0; + + bh = ext4_bread(NULL, inode, block, 0, &err); + if (!bh) { + if (err == 0) { + ext4_error_inode(inode, __func__, line, block, + "Directory hole found"); + return ERR_PTR(-EIO); + } + __ext4_warning(inode->i_sb, __func__, line, + "error reading directory block " + "(ino %lu, block %lu)", inode->i_ino, + (unsigned long) block); + return ERR_PTR(err); + } + dirent = (struct ext4_dir_entry *) bh->b_data; + /* Determine whether or not we have an index block */ + if (is_dx(inode)) { + if (block == 0) + is_dx_block = 1; + else if (ext4_rec_len_from_disk(dirent->rec_len, + inode->i_sb->s_blocksize) == + inode->i_sb->s_blocksize) + is_dx_block = 1; + } + if (!is_dx_block && type == INDEX) { + ext4_error_inode(inode, __func__, line, block, + "directory leaf block found instead of index block"); + return ERR_PTR(-EIO); + } + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) || + buffer_verified(bh)) + return bh; + + /* + * An empty leaf block can get mistaken for a index block; for + * this reason, we can only check the index checksum when the + * caller is sure it should be an index block. + */ + if (is_dx_block && type == INDEX) { + if (ext4_dx_csum_verify(inode, dirent)) + set_buffer_verified(bh); + else { + ext4_error_inode(inode, __func__, line, block, + "Directory index failed checksum"); brelse(bh); - bh = NULL; + return ERR_PTR(-EIO); } } - if (!bh && !(*err)) { - *err = -EIO; - ext4_error(inode->i_sb, - "Directory hole detected on inode %lu\n", - inode->i_ino); + if (!is_dx_block) { + if (ext4_dirent_csum_verify(inode, dirent)) + set_buffer_verified(bh); + else { + ext4_error_inode(inode, __func__, line, block, + "Directory block failed checksum"); + brelse(bh); + return ERR_PTR(-EIO); + } } return bh; } @@ -604,9 +677,9 @@ dx_probe(const struct qstr *d_name, struct inode *dir, u32 hash; frame->bh = NULL; - if (!(bh = ext4_bread(NULL, dir, 0, 0, err))) { - if (*err == 0) - *err = ERR_BAD_DX_DIR; + bh = ext4_read_dirblock(dir, 0, INDEX); + if (IS_ERR(bh)) { + *err = PTR_ERR(bh); goto fail; } root = (struct dx_root *) bh->b_data; @@ -643,15 +716,6 @@ dx_probe(const struct qstr *d_name, struct inode *dir, goto fail; } - if (!buffer_verified(bh) && - !ext4_dx_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) { - ext4_warning(dir->i_sb, "Root failed checksum"); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail; - } - set_buffer_verified(bh); - entries = (struct dx_entry *) (((char *)&root->info) + root->info.info_length); @@ -709,22 +773,12 @@ dx_probe(const struct qstr *d_name, struct inode *dir, frame->entries = entries; frame->at = at; if (!indirect--) return frame; - if (!(bh = ext4_bread(NULL, dir, dx_get_block(at), 0, err))) { - if (!(*err)) - *err = ERR_BAD_DX_DIR; - goto fail2; - } - at = entries = ((struct dx_node *) bh->b_data)->entries; - - if (!buffer_verified(bh) && - !ext4_dx_csum_verify(dir, - (struct ext4_dir_entry *)bh->b_data)) { - ext4_warning(dir->i_sb, "Node failed checksum"); - brelse(bh); - *err = ERR_BAD_DX_DIR; + bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX); + if (IS_ERR(bh)) { + *err = PTR_ERR(bh); goto fail2; } - set_buffer_verified(bh); + entries = ((struct dx_node *) bh->b_data)->entries; if (dx_get_limit(entries) != dx_node_limit (dir)) { ext4_warning(dir->i_sb, @@ -783,7 +837,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, { struct dx_frame *p; struct buffer_head *bh; - int err, num_frames = 0; + int num_frames = 0; __u32 bhash; p = frame; @@ -822,25 +876,9 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, * block so no check is necessary */ while (num_frames--) { - if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at), - 0, &err))) { - if (!err) { - ext4_error(dir->i_sb, - "Directory hole detected on inode %lu\n", - dir->i_ino); - return -EIO; - } - return err; /* Failure */ - } - - if (!buffer_verified(bh) && - !ext4_dx_csum_verify(dir, - (struct ext4_dir_entry *)bh->b_data)) { - ext4_warning(dir->i_sb, "Node failed checksum"); - return -EIO; - } - set_buffer_verified(bh); - + bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX); + if (IS_ERR(bh)) + return PTR_ERR(bh); p++; brelse(p->bh); p->bh = bh; @@ -866,20 +904,9 @@ static int htree_dirblock_to_tree(struct file *dir_file, dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", (unsigned long)block)); - if (!(bh = ext4_bread(NULL, dir, block, 0, &err))) { - if (!err) { - err = -EIO; - ext4_error(dir->i_sb, - "Directory hole detected on inode %lu\n", - dir->i_ino); - } - return err; - } - - if (!buffer_verified(bh) && - !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) - return -EIO; - set_buffer_verified(bh); + bh = ext4_read_dirblock(dir, block, DIRENT); + if (IS_ERR(bh)) + return PTR_ERR(bh); de = (struct ext4_dir_entry_2 *) bh->b_data; top = (struct ext4_dir_entry_2 *) ((char *) de + @@ -937,7 +964,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", start_hash, start_minor_hash)); - dir = dir_file->f_path.dentry->d_inode; + dir = file_inode(dir_file); if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) { hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; if (hinfo.hash_version <= DX_HASH_TEA) @@ -1333,26 +1360,11 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q return NULL; do { block = dx_get_block(frame->at); - if (!(bh = ext4_bread(NULL, dir, block, 0, err))) { - if (!(*err)) { - *err = -EIO; - ext4_error(dir->i_sb, - "Directory hole detected on inode %lu\n", - dir->i_ino); - } - goto errout; - } - - if (!buffer_verified(bh) && - !ext4_dirent_csum_verify(dir, - (struct ext4_dir_entry *)bh->b_data)) { - EXT4_ERROR_INODE(dir, "checksumming directory " - "block %lu", (unsigned long)block); - brelse(bh); - *err = -EIO; + bh = ext4_read_dirblock(dir, block, DIRENT); + if (IS_ERR(bh)) { + *err = PTR_ERR(bh); goto errout; } - set_buffer_verified(bh); retval = search_dirblock(bh, dir, d_name, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); @@ -1536,11 +1548,12 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) csum_size = sizeof(struct ext4_dir_entry_tail); - bh2 = ext4_append (handle, dir, &newblock, &err); - if (!(bh2)) { + bh2 = ext4_append(handle, dir, &newblock); + if (IS_ERR(bh2)) { brelse(*bh); *bh = NULL; - goto errout; + *error = PTR_ERR(bh2); + return NULL; } BUFFER_TRACE(*bh, "get_write_access"); @@ -1621,7 +1634,6 @@ journal_error: brelse(bh2); *bh = NULL; ext4_std_error(dir->i_sb, err); -errout: *error = err; return NULL; } @@ -1699,7 +1711,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; unsigned int blocksize = dir->i_sb->s_blocksize; - unsigned short reclen; int csum_size = 0; int err; @@ -1707,7 +1718,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) csum_size = sizeof(struct ext4_dir_entry_tail); - reclen = EXT4_DIR_REC_LEN(namelen); if (!de) { err = ext4_find_dest_de(dir, inode, bh, bh->b_data, blocksize - csum_size, @@ -1798,10 +1808,10 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, len = ((char *) root) + (blocksize - csum_size) - (char *) de; /* Allocate new block for the 0th block's dirents */ - bh2 = ext4_append(handle, dir, &block, &retval); - if (!(bh2)) { + bh2 = ext4_append(handle, dir, &block); + if (IS_ERR(bh2)) { brelse(bh); - return retval; + return PTR_ERR(bh2); } ext4_set_inode_flag(dir, EXT4_INODE_INDEX); data1 = bh2->b_data; @@ -1918,20 +1928,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, } blocks = dir->i_size >> sb->s_blocksize_bits; for (block = 0; block < blocks; block++) { - if (!(bh = ext4_bread(handle, dir, block, 0, &retval))) { - if (!retval) { - retval = -EIO; - ext4_error(inode->i_sb, - "Directory hole detected on inode %lu\n", - inode->i_ino); - } - return retval; - } - if (!buffer_verified(bh) && - !ext4_dirent_csum_verify(dir, - (struct ext4_dir_entry *)bh->b_data)) - return -EIO; - set_buffer_verified(bh); + bh = ext4_read_dirblock(dir, block, DIRENT); + if (IS_ERR(bh)) + return PTR_ERR(bh); + retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); if (retval != -ENOSPC) { brelse(bh); @@ -1943,9 +1943,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, return make_indexed_dir(handle, dentry, inode, bh); brelse(bh); } - bh = ext4_append(handle, dir, &block, &retval); - if (!bh) - return retval; + bh = ext4_append(handle, dir, &block); + if (IS_ERR(bh)) + return PTR_ERR(bh); de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize); @@ -1982,22 +1982,13 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, return err; entries = frame->entries; at = frame->at; - - if (!(bh = ext4_bread(handle, dir, dx_get_block(frame->at), 0, &err))) { - if (!err) { - err = -EIO; - ext4_error(dir->i_sb, - "Directory hole detected on inode %lu\n", - dir->i_ino); - } + bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + bh = NULL; goto cleanup; } - if (!buffer_verified(bh) && - !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) - goto journal_error; - set_buffer_verified(bh); - BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, bh); if (err) @@ -2025,9 +2016,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, err = -ENOSPC; goto cleanup; } - bh2 = ext4_append (handle, dir, &newblock, &err); - if (!(bh2)) + bh2 = ext4_append(handle, dir, &newblock); + if (IS_ERR(bh2)) { + err = PTR_ERR(bh2); goto cleanup; + } node2 = (struct dx_node *)(bh2->b_data); entries2 = node2->entries; memset(&node2->fake, 0, sizeof(struct fake_dirent)); @@ -2106,8 +2099,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, journal_error: ext4_std_error(dir->i_sb, err); cleanup: - if (bh) - brelse(bh); + brelse(bh); dx_release(frames); return err; } @@ -2254,29 +2246,28 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, { handle_t *handle; struct inode *inode; - int err, retries = 0; + int err, credits, retries = 0; dquot_initialize(dir); + credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); retry: - handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - - inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL); + inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, + NULL, EXT4_HT_DIR, credits); + handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); err = ext4_add_nondir(handle, dentry, inode); + if (!err && IS_DIRSYNC(dir)) + ext4_handle_sync(handle); } - ext4_journal_stop(handle); + if (handle) + ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; @@ -2287,31 +2278,30 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, { handle_t *handle; struct inode *inode; - int err, retries = 0; + int err, credits, retries = 0; if (!new_valid_dev(rdev)) return -EINVAL; dquot_initialize(dir); + credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); retry: - handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - - inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL); + inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, + NULL, EXT4_HT_DIR, credits); + handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &ext4_special_inode_operations; err = ext4_add_nondir(handle, dentry, inode); + if (!err && IS_DIRSYNC(dir)) + ext4_handle_sync(handle); } - ext4_journal_stop(handle); + if (handle) + ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; @@ -2351,6 +2341,7 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir, struct buffer_head *dir_block = NULL; struct ext4_dir_entry_2 *de; struct ext4_dir_entry_tail *t; + ext4_lblk_t block = 0; unsigned int blocksize = dir->i_sb->s_blocksize; int csum_size = 0; int err; @@ -2367,16 +2358,10 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir, goto out; } - inode->i_size = EXT4_I(inode)->i_disksize = blocksize; - if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) { - if (!err) { - err = -EIO; - ext4_error(inode->i_sb, - "Directory hole detected on inode %lu\n", - inode->i_ino); - } - goto out; - } + inode->i_size = 0; + dir_block = ext4_append(handle, inode, &block); + if (IS_ERR(dir_block)) + return PTR_ERR(dir_block); BUFFER_TRACE(dir_block, "get_write_access"); err = ext4_journal_get_write_access(handle, dir_block); if (err) @@ -2403,25 +2388,21 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { handle_t *handle; struct inode *inode; - int err, retries = 0; + int err, credits, retries = 0; if (EXT4_DIR_LINK_MAX(dir)) return -EMLINK; dquot_initialize(dir); + credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); retry: - handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - - inode = ext4_new_inode(handle, dir, S_IFDIR | mode, - &dentry->d_name, 0, NULL); + inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode, + &dentry->d_name, + 0, NULL, EXT4_HT_DIR, credits); + handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; @@ -2449,8 +2430,12 @@ out_clear_inode: goto out_clear_inode; unlock_new_inode(inode); d_instantiate(dentry, inode); + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + out_stop: - ext4_journal_stop(handle); + if (handle) + ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; @@ -2476,25 +2461,14 @@ static int empty_dir(struct inode *inode) } sb = inode->i_sb; - if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || - !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { - if (err) - EXT4_ERROR_INODE(inode, - "error %d reading directory lblock 0", err); - else - ext4_warning(inode->i_sb, - "bad directory (dir #%lu) - no data block", - inode->i_ino); + if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) { + EXT4_ERROR_INODE(inode, "invalid size"); return 1; } - if (!buffer_verified(bh) && - !ext4_dirent_csum_verify(inode, - (struct ext4_dir_entry *)bh->b_data)) { - EXT4_ERROR_INODE(inode, "checksum error reading directory " - "lblock 0"); - return -EIO; - } - set_buffer_verified(bh); + bh = ext4_read_dirblock(inode, 0, EITHER); + if (IS_ERR(bh)) + return 1; + de = (struct ext4_dir_entry_2 *) bh->b_data; de1 = ext4_next_entry(de, sb->s_blocksize); if (le32_to_cpu(de->inode) != inode->i_ino || @@ -2517,28 +2491,9 @@ static int empty_dir(struct inode *inode) err = 0; brelse(bh); lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb); - bh = ext4_bread(NULL, inode, lblock, 0, &err); - if (!bh) { - if (err) - EXT4_ERROR_INODE(inode, - "error %d reading directory " - "lblock %u", err, lblock); - else - ext4_warning(inode->i_sb, - "bad directory (dir #%lu) - no data block", - inode->i_ino); - - offset += sb->s_blocksize; - continue; - } - if (!buffer_verified(bh) && - !ext4_dirent_csum_verify(inode, - (struct ext4_dir_entry *)bh->b_data)) { - EXT4_ERROR_INODE(inode, "checksum error " - "reading directory lblock 0"); - return -EIO; - } - set_buffer_verified(bh); + bh = ext4_read_dirblock(inode, lblock, EITHER); + if (IS_ERR(bh)) + return 1; de = (struct ext4_dir_entry_2 *) bh->b_data; } if (ext4_check_dir_entry(inode, NULL, de, bh, @@ -2717,25 +2672,18 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) struct inode *inode; struct buffer_head *bh; struct ext4_dir_entry_2 *de; - handle_t *handle; + handle_t *handle = NULL; /* Initialize quotas before so that eventual writes go in * separate transaction */ dquot_initialize(dir); dquot_initialize(dentry->d_inode); - handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); if (!bh) goto end_rmdir; - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - inode = dentry->d_inode; retval = -EIO; @@ -2746,6 +2694,17 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) if (!empty_dir(inode)) goto end_rmdir; + handle = ext4_journal_start(dir, EXT4_HT_DIR, + EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + handle = NULL; + goto end_rmdir; + } + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto end_rmdir; @@ -2767,8 +2726,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) ext4_mark_inode_dirty(handle, dir); end_rmdir: - ext4_journal_stop(handle); brelse(bh); + if (handle) + ext4_journal_stop(handle); return retval; } @@ -2778,7 +2738,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) struct inode *inode; struct buffer_head *bh; struct ext4_dir_entry_2 *de; - handle_t *handle; + handle_t *handle = NULL; trace_ext4_unlink_enter(dir, dentry); /* Initialize quotas before so that eventual writes go @@ -2786,13 +2746,6 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) dquot_initialize(dir); dquot_initialize(dentry->d_inode); - handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); if (!bh) @@ -2804,6 +2757,17 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (le32_to_cpu(de->inode) != inode->i_ino) goto end_unlink; + handle = ext4_journal_start(dir, EXT4_HT_DIR, + EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + handle = NULL; + goto end_unlink; + } + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + if (!inode->i_nlink) { ext4_warning(inode->i_sb, "Deleting nonexistent file (%lu), %d", @@ -2824,8 +2788,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) retval = 0; end_unlink: - ext4_journal_stop(handle); brelse(bh); + if (handle) + ext4_journal_stop(handle); trace_ext4_unlink_exit(dentry, retval); return retval; } @@ -2865,15 +2830,10 @@ static int ext4_symlink(struct inode *dir, EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); } retry: - handle = ext4_journal_start(dir, credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - - inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO, - &dentry->d_name, 0, NULL); + inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO, + &dentry->d_name, 0, NULL, + EXT4_HT_DIR, credits); + handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; @@ -2903,7 +2863,7 @@ retry: * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified */ - handle = ext4_journal_start(dir, + handle = ext4_journal_start(dir, EXT4_HT_DIR, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1); if (IS_ERR(handle)) { @@ -2926,8 +2886,12 @@ retry: } EXT4_I(inode)->i_disksize = inode->i_size; err = ext4_add_nondir(handle, dentry, inode); + if (!err && IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + out_stop: - ext4_journal_stop(handle); + if (handle) + ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; @@ -2950,8 +2914,9 @@ static int ext4_link(struct dentry *old_dentry, dquot_initialize(dir); retry: - handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS); + handle = ext4_journal_start(dir, EXT4_HT_DIR, + (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2991,13 +2956,9 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, struct buffer_head *bh; if (!ext4_has_inline_data(inode)) { - if (!(bh = ext4_bread(handle, inode, 0, 0, retval))) { - if (!*retval) { - *retval = -EIO; - ext4_error(inode->i_sb, - "Directory hole detected on inode %lu\n", - inode->i_ino); - } + bh = ext4_read_dirblock(inode, 0, EITHER); + if (IS_ERR(bh)) { + *retval = PTR_ERR(bh); return NULL; } *parent_de = ext4_next_entry( @@ -3034,9 +2995,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, * in separate transaction */ if (new_dentry->d_inode) dquot_initialize(new_dentry->d_inode); - handle = ext4_journal_start(old_dir, 2 * - EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); + handle = ext4_journal_start(old_dir, EXT4_HT_DIR, + (2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -3076,11 +3037,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, &inlined); if (!dir_bh) goto end_rename; - if (!inlined && !buffer_verified(dir_bh) && - !ext4_dirent_csum_verify(old_inode, - (struct ext4_dir_entry *)dir_bh->b_data)) - goto end_rename; - set_buffer_verified(dir_bh); if (le32_to_cpu(parent_de->inode) != old_dir->i_ino) goto end_rename; retval = -EMLINK; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 0016fbca2a40..047a6de04a0a 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -23,6 +23,7 @@ #include <linux/workqueue.h> #include <linux/kernel.h> #include <linux/slab.h> +#include <linux/mm.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -49,11 +50,21 @@ void ext4_exit_pageio(void) kmem_cache_destroy(io_page_cachep); } -void ext4_ioend_wait(struct inode *inode) +/* + * This function is called by ext4_evict_inode() to make sure there is + * no more pending I/O completion work left to do. + */ +void ext4_ioend_shutdown(struct inode *inode) { wait_queue_head_t *wq = ext4_ioend_wq(inode); wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); + /* + * We need to make sure the work structure is finished being + * used before we let the inode get destroyed. + */ + if (work_pending(&EXT4_I(inode)->i_unwritten_work)) + cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); } static void put_io_page(struct ext4_io_page *io_page) @@ -73,8 +84,6 @@ void ext4_free_io_end(ext4_io_end_t *io) BUG_ON(!list_empty(&io->list)); BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN); - if (io->page) - put_page(io->page); for (i = 0; i < io->num_io_pages; i++) put_io_page(io->pages[i]); io->num_io_pages = 0; @@ -103,14 +112,13 @@ static int ext4_end_io(ext4_io_end_t *io) "(inode %lu, offset %llu, size %zd, error %d)", inode->i_ino, offset, size, ret); } - if (io->iocb) - aio_complete(io->iocb, io->result, 0); - - if (io->flag & EXT4_IO_END_DIRECT) - inode_dio_done(inode); /* Wake up anyone waiting on unwritten extent conversion */ if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) wake_up_all(ext4_ioend_wq(inode)); + if (io->flag & EXT4_IO_END_DIRECT) + inode_dio_done(inode); + if (io->iocb) + aio_complete(io->iocb, io->result, 0); return ret; } @@ -119,7 +127,6 @@ static void dump_completed_IO(struct inode *inode) #ifdef EXT4FS_DEBUG struct list_head *cur, *before, *after; ext4_io_end_t *io, *io0, *io1; - unsigned long flags; if (list_empty(&EXT4_I(inode)->i_completed_io_list)) { ext4_debug("inode %lu completed_io list is empty\n", @@ -152,26 +159,20 @@ void ext4_add_complete_io(ext4_io_end_t *io_end) wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; spin_lock_irqsave(&ei->i_completed_io_lock, flags); - if (list_empty(&ei->i_completed_io_list)) { - io_end->flag |= EXT4_IO_END_QUEUED; - queue_work(wq, &io_end->work); - } + if (list_empty(&ei->i_completed_io_list)) + queue_work(wq, &ei->i_unwritten_work); list_add_tail(&io_end->list, &ei->i_completed_io_list); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); } -static int ext4_do_flush_completed_IO(struct inode *inode, - ext4_io_end_t *work_io) +static int ext4_do_flush_completed_IO(struct inode *inode) { ext4_io_end_t *io; - struct list_head unwritten, complete, to_free; + struct list_head unwritten; unsigned long flags; struct ext4_inode_info *ei = EXT4_I(inode); int err, ret = 0; - INIT_LIST_HEAD(&complete); - INIT_LIST_HEAD(&to_free); - spin_lock_irqsave(&ei->i_completed_io_lock, flags); dump_completed_IO(inode); list_replace_init(&ei->i_completed_io_list, &unwritten); @@ -185,32 +186,7 @@ static int ext4_do_flush_completed_IO(struct inode *inode, err = ext4_end_io(io); if (unlikely(!ret && err)) ret = err; - - list_add_tail(&io->list, &complete); - } - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - while (!list_empty(&complete)) { - io = list_entry(complete.next, ext4_io_end_t, list); io->flag &= ~EXT4_IO_END_UNWRITTEN; - /* end_io context can not be destroyed now because it still - * used by queued worker. Worker thread will destroy it later */ - if (io->flag & EXT4_IO_END_QUEUED) - list_del_init(&io->list); - else - list_move(&io->list, &to_free); - } - /* If we are called from worker context, it is time to clear queued - * flag, and destroy it's end_io if it was converted already */ - if (work_io) { - work_io->flag &= ~EXT4_IO_END_QUEUED; - if (!(work_io->flag & EXT4_IO_END_UNWRITTEN)) - list_add_tail(&work_io->list, &to_free); - } - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - - while (!list_empty(&to_free)) { - io = list_entry(to_free.next, ext4_io_end_t, list); - list_del_init(&io->list); ext4_free_io_end(io); } return ret; @@ -219,10 +195,11 @@ static int ext4_do_flush_completed_IO(struct inode *inode, /* * work on completed aio dio IO, to convert unwritten extents to extents */ -static void ext4_end_io_work(struct work_struct *work) +void ext4_end_io_work(struct work_struct *work) { - ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); - ext4_do_flush_completed_IO(io->inode, io); + struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, + i_unwritten_work); + ext4_do_flush_completed_IO(&ei->vfs_inode); } int ext4_flush_unwritten_io(struct inode *inode) @@ -230,7 +207,7 @@ int ext4_flush_unwritten_io(struct inode *inode) int ret; WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) && !(inode->i_state & I_FREEING)); - ret = ext4_do_flush_completed_IO(inode, NULL); + ret = ext4_do_flush_completed_IO(inode); ext4_unwritten_wait(inode); return ret; } @@ -241,7 +218,6 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) if (io) { atomic_inc(&EXT4_I(inode)->i_ioend_count); io->inode = inode; - INIT_WORK(&io->work, ext4_end_io_work); INIT_LIST_HEAD(&io->list); } return io; @@ -382,14 +358,6 @@ static int io_submit_add_bh(struct ext4_io_submit *io, unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); } - if (!buffer_mapped(bh) || buffer_delay(bh)) { - if (!buffer_mapped(bh)) - clear_buffer_dirty(bh); - if (io->io_bio) - ext4_io_submit(io); - return 0; - } - if (io->io_bio && bh->b_blocknr != io->io_next_block) { submit_and_retry: ext4_io_submit(io); @@ -436,7 +404,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS); if (!io_page) { - set_page_dirty(page); + redirty_page_for_writepage(wbc, page); unlock_page(page); return -ENOMEM; } @@ -468,7 +436,15 @@ int ext4_bio_write_page(struct ext4_io_submit *io, set_buffer_uptodate(bh); continue; } - clear_buffer_dirty(bh); + if (!buffer_dirty(bh) || buffer_delay(bh) || + !buffer_mapped(bh) || buffer_unwritten(bh)) { + /* A hole? We can safely clear the dirty bit */ + if (!buffer_mapped(bh)) + clear_buffer_dirty(bh); + if (io->io_bio) + ext4_io_submit(io); + continue; + } ret = io_submit_add_bh(io, io_page, inode, wbc, bh); if (ret) { /* @@ -476,9 +452,10 @@ int ext4_bio_write_page(struct ext4_io_submit *io, * we can do but mark the page as dirty, and * better luck next time. */ - set_page_dirty(page); + redirty_page_for_writepage(wbc, page); break; } + clear_buffer_dirty(bh); } unlock_page(page); /* diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index d99387b89edd..c169477a62c9 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -333,8 +333,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, int err; bh = sb_getblk(sb, blk); - if (!bh) - return ERR_PTR(-EIO); + if (unlikely(!bh)) + return ERR_PTR(-ENOMEM); if ((err = ext4_journal_get_write_access(handle, bh))) { brelse(bh); bh = ERR_PTR(err); @@ -410,8 +410,8 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, return err; bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap); - if (!bh) - return -EIO; + if (unlikely(!bh)) + return -ENOMEM; err = ext4_journal_get_write_access(handle, bh); if (err) @@ -466,7 +466,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb, meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); /* This transaction may be extended/restarted along the way */ - handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -500,8 +500,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb, goto out; gdb = sb_getblk(sb, block); - if (!gdb) { - err = -EIO; + if (unlikely(!gdb)) { + err = -ENOMEM; goto out; } @@ -1031,7 +1031,7 @@ static void update_backups(struct super_block *sb, int blk_off, char *data, handle_t *handle; int err = 0, err2; - handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA); if (IS_ERR(handle)) { group = 1; err = PTR_ERR(handle); @@ -1064,8 +1064,8 @@ static void update_backups(struct super_block *sb, int blk_off, char *data, ext4_bg_has_super(sb, group)); bh = sb_getblk(sb, backup_block); - if (!bh) { - err = -EIO; + if (unlikely(!bh)) { + err = -ENOMEM; break; } ext4_debug("update metadata backup %llu(+%llu)\n", @@ -1168,7 +1168,7 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block) { struct buffer_head *bh = sb_getblk(sb, block); - if (!bh) + if (unlikely(!bh)) return NULL; if (!bh_uptodate_or_lock(bh)) { if (bh_submit_read(bh) < 0) { @@ -1247,7 +1247,7 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, ext4_inode_table_set(sb, gdp, group_data->inode_table); ext4_free_group_clusters_set(sb, gdp, - EXT4_B2C(sbi, group_data->free_blocks_count)); + EXT4_NUM_B2C(sbi, group_data->free_blocks_count)); ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); if (ext4_has_group_desc_csum(sb)) ext4_itable_unused_set(sb, gdp, @@ -1349,7 +1349,7 @@ static void ext4_update_super(struct super_block *sb, /* Update the free space counts */ percpu_counter_add(&sbi->s_freeclusters_counter, - EXT4_B2C(sbi, free_blocks)); + EXT4_NUM_B2C(sbi, free_blocks)); percpu_counter_add(&sbi->s_freeinodes_counter, EXT4_INODES_PER_GROUP(sb) * flex_gd->count); @@ -1360,8 +1360,8 @@ static void ext4_update_super(struct super_block *sb, sbi->s_log_groups_per_flex) { ext4_group_t flex_group; flex_group = ext4_flex_group(sbi, group_data[0].group); - atomic_add(EXT4_B2C(sbi, free_blocks), - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(EXT4_NUM_B2C(sbi, free_blocks), + &sbi->s_flex_groups[flex_group].free_clusters); atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count, &sbi->s_flex_groups[flex_group].free_inodes); } @@ -1412,7 +1412,7 @@ static int ext4_flex_group_add(struct super_block *sb, * modify each of the reserved GDT dindirect blocks. */ credit = flex_gd->count * 4 + reserved_gdb; - handle = ext4_journal_start_sb(sb, credit); + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credit); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto exit; @@ -1506,10 +1506,12 @@ static int ext4_setup_next_flex_gd(struct super_block *sb, group_data[i].blocks_count = blocks_per_group; overhead = ext4_group_overhead_blocks(sb, group + i); group_data[i].free_blocks_count = blocks_per_group - overhead; - if (ext4_has_group_desc_csum(sb)) + if (ext4_has_group_desc_csum(sb)) { flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT | EXT4_BG_INODE_UNINIT; - else + if (!test_opt(sb, INIT_INODE_TABLE)) + flex_gd->bg_flags[i] |= EXT4_BG_INODE_ZEROED; + } else flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED; } @@ -1594,7 +1596,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) err = ext4_alloc_flex_bg_array(sb, input->group + 1); if (err) - return err; + goto out; err = ext4_mb_alloc_groupinfo(sb, input->group + 1); if (err) @@ -1622,7 +1624,7 @@ static int ext4_group_extend_no_check(struct super_block *sb, /* We will update the superblock, one block bitmap, and * one group descriptor via ext4_group_add_blocks(). */ - handle = ext4_journal_start_sb(sb, 3); + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, 3); if (IS_ERR(handle)) { err = PTR_ERR(handle); ext4_warning(sb, "error %d on journal start", err); @@ -1786,7 +1788,7 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode) credits += 3; /* block bitmap, bg descriptor, resize inode */ } - handle = ext4_journal_start_sb(sb, credits); + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credits); if (IS_ERR(handle)) return PTR_ERR(handle); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3d4fb81bacd5..5d6d53578124 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -69,8 +69,6 @@ static void ext4_mark_recovery_complete(struct super_block *sb, static void ext4_clear_journal_err(struct super_block *sb, struct ext4_super_block *es); static int ext4_sync_fs(struct super_block *sb, int wait); -static const char *ext4_decode_error(struct super_block *sb, int errno, - char nbuf[16]); static int ext4_remount(struct super_block *sb, int *flags, char *data); static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); static int ext4_unfreeze(struct super_block *sb); @@ -92,6 +90,8 @@ static struct file_system_type ext2_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("ext2"); +MODULE_ALIAS("ext2"); #define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type) #else #define IS_EXT2_SB(sb) (0) @@ -106,6 +106,8 @@ static struct file_system_type ext3_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("ext3"); +MODULE_ALIAS("ext3"); #define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) #else #define IS_EXT3_SB(sb) (0) @@ -296,107 +298,6 @@ void ext4_itable_unused_set(struct super_block *sb, } -/* Just increment the non-pointer handle value */ -static handle_t *ext4_get_nojournal(void) -{ - handle_t *handle = current->journal_info; - unsigned long ref_cnt = (unsigned long)handle; - - BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT); - - ref_cnt++; - handle = (handle_t *)ref_cnt; - - current->journal_info = handle; - return handle; -} - - -/* Decrement the non-pointer handle value */ -static void ext4_put_nojournal(handle_t *handle) -{ - unsigned long ref_cnt = (unsigned long)handle; - - BUG_ON(ref_cnt == 0); - - ref_cnt--; - handle = (handle_t *)ref_cnt; - - current->journal_info = handle; -} - -/* - * Wrappers for jbd2_journal_start/end. - */ -handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) -{ - journal_t *journal; - - trace_ext4_journal_start(sb, nblocks, _RET_IP_); - if (sb->s_flags & MS_RDONLY) - return ERR_PTR(-EROFS); - - WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); - journal = EXT4_SB(sb)->s_journal; - if (!journal) - return ext4_get_nojournal(); - /* - * Special case here: if the journal has aborted behind our - * backs (eg. EIO in the commit thread), then we still need to - * take the FS itself readonly cleanly. - */ - if (is_journal_aborted(journal)) { - ext4_abort(sb, "Detected aborted journal"); - return ERR_PTR(-EROFS); - } - return jbd2_journal_start(journal, nblocks); -} - -int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) -{ - struct super_block *sb; - int err; - int rc; - - if (!ext4_handle_valid(handle)) { - ext4_put_nojournal(handle); - return 0; - } - sb = handle->h_transaction->t_journal->j_private; - err = handle->h_err; - rc = jbd2_journal_stop(handle); - - if (!err) - err = rc; - if (err) - __ext4_std_error(sb, where, line, err); - return err; -} - -void ext4_journal_abort_handle(const char *caller, unsigned int line, - const char *err_fn, struct buffer_head *bh, - handle_t *handle, int err) -{ - char nbuf[16]; - const char *errstr = ext4_decode_error(NULL, err, nbuf); - - BUG_ON(!ext4_handle_valid(handle)); - - if (bh) - BUFFER_TRACE(bh, "abort"); - - if (!handle->h_err) - handle->h_err = err; - - if (is_handle_aborted(handle)) - return; - - printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n", - caller, line, errstr, err_fn); - - jbd2_journal_abort_handle(handle); -} - static void __save_error_info(struct super_block *sb, const char *func, unsigned int line) { @@ -553,7 +454,7 @@ void ext4_error_file(struct file *file, const char *function, va_list args; struct va_format vaf; struct ext4_super_block *es; - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); char pathname[80], *path; es = EXT4_SB(inode->i_sb)->s_es; @@ -582,8 +483,8 @@ void ext4_error_file(struct file *file, const char *function, ext4_handle_error(inode->i_sb); } -static const char *ext4_decode_error(struct super_block *sb, int errno, - char nbuf[16]) +const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]) { char *errstr = NULL; @@ -858,6 +759,7 @@ static void ext4_put_super(struct super_block *sb) ext4_abort(sb, "Couldn't clean up the journal"); } + ext4_es_unregister_shrinker(sb); del_timer(&sbi->s_err_report); ext4_release_system_zone(sb); ext4_mb_release(sb); @@ -885,6 +787,7 @@ static void ext4_put_super(struct super_block *sb) percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); + percpu_counter_destroy(&sbi->s_extent_cache_cnt); brelse(sbi->s_sbh); #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) @@ -939,11 +842,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) return NULL; ei->vfs_inode.i_version = 1; - memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); ext4_es_init_tree(&ei->i_es_tree); rwlock_init(&ei->i_es_lock); + INIT_LIST_HEAD(&ei->i_es_lru); + ei->i_es_lru_nr = 0; ei->i_reserved_data_blocks = 0; ei->i_reserved_meta_blocks = 0; ei->i_allocated_meta_blocks = 0; @@ -960,6 +864,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_datasync_tid = 0; atomic_set(&ei->i_ioend_count, 0); atomic_set(&ei->i_unwritten, 0); + INIT_WORK(&ei->i_unwritten_work, ext4_end_io_work); return &ei->vfs_inode; } @@ -1031,6 +936,7 @@ void ext4_clear_inode(struct inode *inode) dquot_drop(inode); ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); + ext4_es_lru_del(inode); if (EXT4_I(inode)->jinode) { jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode); @@ -1280,8 +1186,8 @@ static const match_table_t tokens = { {Opt_stripe, "stripe=%u"}, {Opt_delalloc, "delalloc"}, {Opt_nodelalloc, "nodelalloc"}, - {Opt_mblk_io_submit, "mblk_io_submit"}, - {Opt_nomblk_io_submit, "nomblk_io_submit"}, + {Opt_removed, "mblk_io_submit"}, + {Opt_removed, "nomblk_io_submit"}, {Opt_block_validity, "block_validity"}, {Opt_noblock_validity, "noblock_validity"}, {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, @@ -1337,6 +1243,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) { struct ext4_sb_info *sbi = EXT4_SB(sb); char *qname; + int ret = -1; if (sb_any_quota_loaded(sb) && !sbi->s_qf_names[qtype]) { @@ -1345,29 +1252,37 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) "quota options when quota turned on"); return -1; } + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) { + ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options " + "when QUOTA feature is enabled"); + return -1; + } qname = match_strdup(args); if (!qname) { ext4_msg(sb, KERN_ERR, "Not enough memory for storing quotafile name"); return -1; } - if (sbi->s_qf_names[qtype] && - strcmp(sbi->s_qf_names[qtype], qname)) { - ext4_msg(sb, KERN_ERR, - "%s quota file already specified", QTYPE2NAME(qtype)); - kfree(qname); - return -1; + if (sbi->s_qf_names[qtype]) { + if (strcmp(sbi->s_qf_names[qtype], qname) == 0) + ret = 1; + else + ext4_msg(sb, KERN_ERR, + "%s quota file already specified", + QTYPE2NAME(qtype)); + goto errout; } - sbi->s_qf_names[qtype] = qname; - if (strchr(sbi->s_qf_names[qtype], '/')) { + if (strchr(qname, '/')) { ext4_msg(sb, KERN_ERR, "quotafile must be on filesystem root"); - kfree(sbi->s_qf_names[qtype]); - sbi->s_qf_names[qtype] = NULL; - return -1; + goto errout; } + sbi->s_qf_names[qtype] = qname; set_opt(sb, QUOTA); return 1; +errout: + kfree(qname); + return ret; } static int clear_qf_name(struct super_block *sb, int qtype) @@ -1381,10 +1296,7 @@ static int clear_qf_name(struct super_block *sb, int qtype) " when quota turned on"); return -1; } - /* - * The space will be released later when all options are confirmed - * to be correct - */ + kfree(sbi->s_qf_names[qtype]); sbi->s_qf_names[qtype] = NULL; return 1; } @@ -1404,6 +1316,9 @@ static int clear_qf_name(struct super_block *sb, int qtype) #define MOPT_QFMT MOPT_NOSUPPORT #endif #define MOPT_DATAJ 0x0080 +#define MOPT_NO_EXT2 0x0100 +#define MOPT_NO_EXT3 0x0200 +#define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) static const struct mount_opts { int token; @@ -1414,25 +1329,31 @@ static const struct mount_opts { {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR}, {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET}, {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR}, - {Opt_mblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_SET}, - {Opt_nomblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_CLEAR}, {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET}, {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR}, - {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_SET}, - {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_CLEAR}, + {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, + MOPT_EXT4_ONLY | MOPT_SET}, + {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, + MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET}, {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR}, - {Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_SET | MOPT_EXPLICIT}, - {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_CLEAR | MOPT_EXPLICIT}, - {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_SET}, + {Opt_delalloc, EXT4_MOUNT_DELALLOC, + MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, + {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, + MOPT_EXT4_ONLY | MOPT_CLEAR | MOPT_EXPLICIT}, + {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, + MOPT_EXT4_ONLY | MOPT_SET}, {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | - EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_SET}, - {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_SET}, + EXT4_MOUNT_JOURNAL_CHECKSUM), + MOPT_EXT4_ONLY | MOPT_SET}, + {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET}, {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR}, {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR}, {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR}, - {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_SET}, - {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_CLEAR}, + {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, + MOPT_NO_EXT2 | MOPT_SET}, + {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, + MOPT_NO_EXT2 | MOPT_CLEAR}, {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, @@ -1444,9 +1365,14 @@ static const struct mount_opts { {Opt_inode_readahead_blks, 0, MOPT_GTE0}, {Opt_init_itable, 0, MOPT_GTE0}, {Opt_stripe, 0, MOPT_GTE0}, - {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ}, - {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ}, - {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ}, + {Opt_resuid, 0, MOPT_GTE0}, + {Opt_resgid, 0, MOPT_GTE0}, + {Opt_journal_dev, 0, MOPT_GTE0}, + {Opt_journal_ioprio, 0, MOPT_GTE0}, + {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, + {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, + {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, + MOPT_NO_EXT2 | MOPT_DATAJ}, {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR}, #ifdef CONFIG_EXT4_FS_POSIX_ACL @@ -1496,8 +1422,6 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, else if (token == Opt_offgrpjquota) return clear_qf_name(sb, GRPQUOTA); #endif - if (args->from && match_int(args, &arg)) - return -1; switch (token) { case Opt_noacl: case Opt_nouser_xattr: @@ -1506,138 +1430,156 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, case Opt_sb: return 1; /* handled by get_sb_block() */ case Opt_removed: - ext4_msg(sb, KERN_WARNING, - "Ignoring removed %s option", opt); + ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt); + return 1; + case Opt_abort: + sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; + return 1; + case Opt_i_version: + sb->s_flags |= MS_I_VERSION; return 1; - case Opt_resuid: + } + + for (m = ext4_mount_opts; m->token != Opt_err; m++) + if (token == m->token) + break; + + if (m->token == Opt_err) { + ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " + "or missing value", opt); + return -1; + } + + if ((m->flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) { + ext4_msg(sb, KERN_ERR, + "Mount option \"%s\" incompatible with ext2", opt); + return -1; + } + if ((m->flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) { + ext4_msg(sb, KERN_ERR, + "Mount option \"%s\" incompatible with ext3", opt); + return -1; + } + + if (args->from && match_int(args, &arg)) + return -1; + if (args->from && (m->flags & MOPT_GTE0) && (arg < 0)) + return -1; + if (m->flags & MOPT_EXPLICIT) + set_opt2(sb, EXPLICIT_DELALLOC); + if (m->flags & MOPT_CLEAR_ERR) + clear_opt(sb, ERRORS_MASK); + if (token == Opt_noquota && sb_any_quota_loaded(sb)) { + ext4_msg(sb, KERN_ERR, "Cannot change quota " + "options when quota turned on"); + return -1; + } + + if (m->flags & MOPT_NOSUPPORT) { + ext4_msg(sb, KERN_ERR, "%s option not supported", opt); + } else if (token == Opt_commit) { + if (arg == 0) + arg = JBD2_DEFAULT_MAX_COMMIT_AGE; + sbi->s_commit_interval = HZ * arg; + } else if (token == Opt_max_batch_time) { + if (arg == 0) + arg = EXT4_DEF_MAX_BATCH_TIME; + sbi->s_max_batch_time = arg; + } else if (token == Opt_min_batch_time) { + sbi->s_min_batch_time = arg; + } else if (token == Opt_inode_readahead_blks) { + if (arg && (arg > (1 << 30) || !is_power_of_2(arg))) { + ext4_msg(sb, KERN_ERR, + "EXT4-fs: inode_readahead_blks must be " + "0 or a power of 2 smaller than 2^31"); + return -1; + } + sbi->s_inode_readahead_blks = arg; + } else if (token == Opt_init_itable) { + set_opt(sb, INIT_INODE_TABLE); + if (!args->from) + arg = EXT4_DEF_LI_WAIT_MULT; + sbi->s_li_wait_mult = arg; + } else if (token == Opt_max_dir_size_kb) { + sbi->s_max_dir_size_kb = arg; + } else if (token == Opt_stripe) { + sbi->s_stripe = arg; + } else if (token == Opt_resuid) { uid = make_kuid(current_user_ns(), arg); if (!uid_valid(uid)) { ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg); return -1; } sbi->s_resuid = uid; - return 1; - case Opt_resgid: + } else if (token == Opt_resgid) { gid = make_kgid(current_user_ns(), arg); if (!gid_valid(gid)) { ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg); return -1; } sbi->s_resgid = gid; - return 1; - case Opt_abort: - sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; - return 1; - case Opt_i_version: - sb->s_flags |= MS_I_VERSION; - return 1; - case Opt_journal_dev: + } else if (token == Opt_journal_dev) { if (is_remount) { ext4_msg(sb, KERN_ERR, "Cannot specify journal on remount"); return -1; } *journal_devnum = arg; - return 1; - case Opt_journal_ioprio: - if (arg < 0 || arg > 7) - return -1; - *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); - return 1; - } - - for (m = ext4_mount_opts; m->token != Opt_err; m++) { - if (token != m->token) - continue; - if (args->from && (m->flags & MOPT_GTE0) && (arg < 0)) - return -1; - if (m->flags & MOPT_EXPLICIT) - set_opt2(sb, EXPLICIT_DELALLOC); - if (m->flags & MOPT_CLEAR_ERR) - clear_opt(sb, ERRORS_MASK); - if (token == Opt_noquota && sb_any_quota_loaded(sb)) { - ext4_msg(sb, KERN_ERR, "Cannot change quota " - "options when quota turned on"); + } else if (token == Opt_journal_ioprio) { + if (arg > 7) { + ext4_msg(sb, KERN_ERR, "Invalid journal IO priority" + " (must be 0-7)"); return -1; } - - if (m->flags & MOPT_NOSUPPORT) { - ext4_msg(sb, KERN_ERR, "%s option not supported", opt); - } else if (token == Opt_commit) { - if (arg == 0) - arg = JBD2_DEFAULT_MAX_COMMIT_AGE; - sbi->s_commit_interval = HZ * arg; - } else if (token == Opt_max_batch_time) { - if (arg == 0) - arg = EXT4_DEF_MAX_BATCH_TIME; - sbi->s_max_batch_time = arg; - } else if (token == Opt_min_batch_time) { - sbi->s_min_batch_time = arg; - } else if (token == Opt_inode_readahead_blks) { - if (arg > (1 << 30)) - return -1; - if (arg && !is_power_of_2(arg)) { + *journal_ioprio = + IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); + } else if (m->flags & MOPT_DATAJ) { + if (is_remount) { + if (!sbi->s_journal) + ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); + else if (test_opt(sb, DATA_FLAGS) != m->mount_opt) { ext4_msg(sb, KERN_ERR, - "EXT4-fs: inode_readahead_blks" - " must be a power of 2"); - return -1; - } - sbi->s_inode_readahead_blks = arg; - } else if (token == Opt_init_itable) { - set_opt(sb, INIT_INODE_TABLE); - if (!args->from) - arg = EXT4_DEF_LI_WAIT_MULT; - sbi->s_li_wait_mult = arg; - } else if (token == Opt_max_dir_size_kb) { - sbi->s_max_dir_size_kb = arg; - } else if (token == Opt_stripe) { - sbi->s_stripe = arg; - } else if (m->flags & MOPT_DATAJ) { - if (is_remount) { - if (!sbi->s_journal) - ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); - else if (test_opt(sb, DATA_FLAGS) != - m->mount_opt) { - ext4_msg(sb, KERN_ERR, "Cannot change data mode on remount"); - return -1; - } - } else { - clear_opt(sb, DATA_FLAGS); - sbi->s_mount_opt |= m->mount_opt; - } -#ifdef CONFIG_QUOTA - } else if (m->flags & MOPT_QFMT) { - if (sb_any_quota_loaded(sb) && - sbi->s_jquota_fmt != m->mount_opt) { - ext4_msg(sb, KERN_ERR, "Cannot " - "change journaled quota options " - "when quota turned on"); return -1; } - sbi->s_jquota_fmt = m->mount_opt; -#endif } else { - if (!args->from) - arg = 1; - if (m->flags & MOPT_CLEAR) - arg = !arg; - else if (unlikely(!(m->flags & MOPT_SET))) { - ext4_msg(sb, KERN_WARNING, - "buggy handling of option %s", opt); - WARN_ON(1); - return -1; - } - if (arg != 0) - sbi->s_mount_opt |= m->mount_opt; - else - sbi->s_mount_opt &= ~m->mount_opt; + clear_opt(sb, DATA_FLAGS); + sbi->s_mount_opt |= m->mount_opt; } - return 1; +#ifdef CONFIG_QUOTA + } else if (m->flags & MOPT_QFMT) { + if (sb_any_quota_loaded(sb) && + sbi->s_jquota_fmt != m->mount_opt) { + ext4_msg(sb, KERN_ERR, "Cannot change journaled " + "quota options when quota turned on"); + return -1; + } + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_QUOTA)) { + ext4_msg(sb, KERN_ERR, + "Cannot set journaled quota options " + "when QUOTA feature is enabled"); + return -1; + } + sbi->s_jquota_fmt = m->mount_opt; +#endif + } else { + if (!args->from) + arg = 1; + if (m->flags & MOPT_CLEAR) + arg = !arg; + else if (unlikely(!(m->flags & MOPT_SET))) { + ext4_msg(sb, KERN_WARNING, + "buggy handling of option %s", opt); + WARN_ON(1); + return -1; + } + if (arg != 0) + sbi->s_mount_opt |= m->mount_opt; + else + sbi->s_mount_opt &= ~m->mount_opt; } - ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " - "or missing value", opt); - return -1; + return 1; } static int parse_options(char *options, struct super_block *sb, @@ -1667,6 +1609,12 @@ static int parse_options(char *options, struct super_block *sb, return 0; } #ifdef CONFIG_QUOTA + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && + (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) { + ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA " + "feature is enabled"); + return 0; + } if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) clear_opt(sb, USRQUOTA); @@ -1979,8 +1927,8 @@ static int ext4_fill_flex_info(struct super_block *sb) flex_group = ext4_flex_group(sbi, i); atomic_add(ext4_free_inodes_count(sb, gdp), &sbi->s_flex_groups[flex_group].free_inodes); - atomic_add(ext4_free_group_clusters(sb, gdp), - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(ext4_free_group_clusters(sb, gdp), + &sbi->s_flex_groups[flex_group].free_clusters); atomic_add(ext4_used_dirs_count(sb, gdp), &sbi->s_flex_groups[flex_group].used_dirs); } @@ -2776,7 +2724,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr) break; } - if (group == ngroups) + if (group >= ngroups) ret = 1; if (!ret) { @@ -3016,33 +2964,34 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, return elr; } -static int ext4_register_li_request(struct super_block *sb, - ext4_group_t first_not_zeroed) +int ext4_register_li_request(struct super_block *sb, + ext4_group_t first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_li_request *elr; + struct ext4_li_request *elr = NULL; ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; int ret = 0; + mutex_lock(&ext4_li_mtx); if (sbi->s_li_request != NULL) { /* * Reset timeout so it can be computed again, because * s_li_wait_mult might have changed. */ sbi->s_li_request->lr_timeout = 0; - return 0; + goto out; } if (first_not_zeroed == ngroups || (sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE)) - return 0; + goto out; elr = ext4_li_request_new(sb, first_not_zeroed); - if (!elr) - return -ENOMEM; - - mutex_lock(&ext4_li_mtx); + if (!elr) { + ret = -ENOMEM; + goto out; + } if (NULL == ext4_li_info) { ret = ext4_li_info_new(); @@ -3235,7 +3184,7 @@ int ext4_calculate_overhead(struct super_block *sb) } /* Add the journal blocks as well */ if (sbi->s_journal) - overhead += EXT4_B2C(sbi, sbi->s_journal->j_maxlen); + overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen); sbi->s_overhead = overhead; smp_wmb(); @@ -3379,7 +3328,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_EXT4_FS_POSIX_ACL set_opt(sb, POSIX_ACL); #endif - set_opt(sb, MBLK_IO_SUBMIT); if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) set_opt(sb, JOURNAL_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) @@ -3763,6 +3711,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (!err) { err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0); } + if (!err) { + err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0); + } if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); goto failed_mount3; @@ -3772,6 +3723,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_max_writeback_mb_bump = 128; sbi->s_extent_max_zeroout_kb = 32; + /* Register extent status tree shrinker */ + ext4_es_register_shrinker(sb); + /* * set up enough so that it can read an inode */ @@ -3783,13 +3737,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_QUOTA - sb->s_qcop = &ext4_qctl_operations; sb->dq_op = &ext4_quota_operations; - - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) { - /* Use qctl operations for hidden quota files. */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) sb->s_qcop = &ext4_qctl_sysfile_operations; - } + else + sb->s_qcop = &ext4_qctl_operations; #endif memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); @@ -3985,6 +3937,16 @@ no_journal: if (err) goto failed_mount7; +#ifdef CONFIG_QUOTA + /* Enable quota usage during mount. */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && + !(sb->s_flags & MS_RDONLY)) { + err = ext4_enable_quotas(sb); + if (err) + goto failed_mount8; + } +#endif /* CONFIG_QUOTA */ + EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; @@ -4002,16 +3964,6 @@ no_journal: } else descr = "out journal"; -#ifdef CONFIG_QUOTA - /* Enable quota usage during mount. */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && - !(sb->s_flags & MS_RDONLY)) { - err = ext4_enable_quotas(sb); - if (err) - goto failed_mount7; - } -#endif /* CONFIG_QUOTA */ - if (test_opt(sb, DISCARD)) { struct request_queue *q = bdev_get_queue(sb->s_bdev); if (!blk_queue_discard(q)) @@ -4035,6 +3987,10 @@ cantfind_ext4: ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); goto failed_mount; +#ifdef CONFIG_QUOTA +failed_mount8: + kobject_del(&sbi->s_kobj); +#endif failed_mount7: ext4_unregister_li_request(sb); failed_mount6: @@ -4061,6 +4017,7 @@ failed_mount3: percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); + percpu_counter_destroy(&sbi->s_extent_cache_cnt); if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); failed_mount2: @@ -4476,16 +4433,12 @@ static void ext4_clear_journal_err(struct super_block *sb, int ext4_force_commit(struct super_block *sb) { journal_t *journal; - int ret = 0; if (sb->s_flags & MS_RDONLY) return 0; journal = EXT4_SB(sb)->s_journal; - if (journal) - ret = ext4_journal_force_commit(journal); - - return ret; + return ext4_journal_force_commit(journal); } static int ext4_sync_fs(struct super_block *sb, int wait) @@ -4588,7 +4541,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; int err = 0; #ifdef CONFIG_QUOTA - int i; + int i, j; #endif char *orig_data = kstrdup(data, GFP_KERNEL); @@ -4604,7 +4557,17 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) #ifdef CONFIG_QUOTA old_opts.s_jquota_fmt = sbi->s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) - old_opts.s_qf_names[i] = sbi->s_qf_names[i]; + if (sbi->s_qf_names[i]) { + old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], + GFP_KERNEL); + if (!old_opts.s_qf_names[i]) { + for (j = 0; j < i; j++) + kfree(old_opts.s_qf_names[j]); + kfree(orig_data); + return -ENOMEM; + } + } else + old_opts.s_qf_names[i] = NULL; #endif if (sbi->s_journal && sbi->s_journal->j_task->io_context) journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; @@ -4737,9 +4700,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) #ifdef CONFIG_QUOTA /* Release old quota file names */ for (i = 0; i < MAXQUOTAS; i++) - if (old_opts.s_qf_names[i] && - old_opts.s_qf_names[i] != sbi->s_qf_names[i]) - kfree(old_opts.s_qf_names[i]); + kfree(old_opts.s_qf_names[i]); if (enable_quota) { if (sb_any_quota_suspended(sb)) dquot_resume(sb, -1); @@ -4768,9 +4729,7 @@ restore_opts: #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { - if (sbi->s_qf_names[i] && - old_opts.s_qf_names[i] != sbi->s_qf_names[i]) - kfree(sbi->s_qf_names[i]); + kfree(sbi->s_qf_names[i]); sbi->s_qf_names[i] = old_opts.s_qf_names[i]; } #endif @@ -4835,7 +4794,7 @@ static int ext4_write_dquot(struct dquot *dquot) struct inode *inode; inode = dquot_to_inode(dquot); - handle = ext4_journal_start(inode, + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -4851,7 +4810,7 @@ static int ext4_acquire_dquot(struct dquot *dquot) int ret, err; handle_t *handle; - handle = ext4_journal_start(dquot_to_inode(dquot), + handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -4867,7 +4826,7 @@ static int ext4_release_dquot(struct dquot *dquot) int ret, err; handle_t *handle; - handle = ext4_journal_start(dquot_to_inode(dquot), + handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) { /* Release dquot anyway to avoid endless cycle in dqput() */ @@ -4883,9 +4842,12 @@ static int ext4_release_dquot(struct dquot *dquot) static int ext4_mark_dquot_dirty(struct dquot *dquot) { + struct super_block *sb = dquot->dq_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + /* Are we journaling quotas? */ - if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || - EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) || + sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { dquot_mark_dquot_dirty(dquot); return ext4_write_dquot(dquot); } else { @@ -4899,7 +4861,7 @@ static int ext4_write_info(struct super_block *sb, int type) handle_t *handle; /* Data block + inode block */ - handle = ext4_journal_start(sb->s_root->d_inode, 2); + handle = ext4_journal_start(sb->s_root->d_inode, EXT4_HT_QUOTA, 2); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit_info(sb, type); @@ -5005,9 +4967,9 @@ static int ext4_enable_quotas(struct super_block *sb) DQUOT_USAGE_ENABLED); if (err) { ext4_warning(sb, - "Failed to enable quota (type=%d) " - "tracking. Please run e2fsck to fix.", - type); + "Failed to enable quota tracking " + "(type=%d, err=%d). Please run " + "e2fsck to fix.", type, err); return err; } } @@ -5045,7 +5007,7 @@ static int ext4_quota_off(struct super_block *sb, int type) /* Update modification times of quota files when userspace can * start looking at them */ - handle = ext4_journal_start(inode, 1); + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); if (IS_ERR(handle)) goto out; inode->i_mtime = inode->i_ctime = CURRENT_TIME; @@ -5194,7 +5156,6 @@ static inline int ext2_feature_set_ok(struct super_block *sb) return 0; return 1; } -MODULE_ALIAS("ext2"); #else static inline void register_as_ext2(void) { } static inline void unregister_as_ext2(void) { } @@ -5227,7 +5188,6 @@ static inline int ext3_feature_set_ok(struct super_block *sb) return 0; return 1; } -MODULE_ALIAS("ext3"); #else static inline void register_as_ext3(void) { } static inline void unregister_as_ext3(void) { } @@ -5241,6 +5201,7 @@ static struct file_system_type ext4_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("ext4"); static int __init ext4_init_feat_adverts(void) { diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 3a91ebc2b66f..3a120b277240 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -549,7 +549,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, error = ext4_handle_dirty_xattr_block(handle, inode, bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); - dquot_free_block(inode, 1); + dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1)); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); } @@ -832,7 +832,8 @@ inserted: else { /* The old block is released after updating the inode. */ - error = dquot_alloc_block(inode, 1); + error = dquot_alloc_block(inode, + EXT4_C2B(EXT4_SB(sb), 1)); if (error) goto cleanup; error = ext4_journal_get_write_access(handle, @@ -886,17 +887,18 @@ inserted: (unsigned long long)block); new_bh = sb_getblk(sb, block); - if (!new_bh) { + if (unlikely(!new_bh)) { + error = -ENOMEM; getblk_failed: ext4_free_blocks(handle, inode, NULL, block, 1, EXT4_FREE_BLOCKS_METADATA); - error = -EIO; goto cleanup; } lock_buffer(new_bh); error = ext4_journal_get_create_access(handle, new_bh); if (error) { unlock_buffer(new_bh); + error = -EIO; goto getblk_failed; } memcpy(new_bh->b_data, s->base, new_bh->b_size); @@ -928,7 +930,7 @@ cleanup: return error; cleanup_dquot: - dquot_free_block(inode, 1); + dquot_free_block(inode, EXT4_C2B(EXT4_SB(sb), 1)); goto cleanup; bad_block: @@ -1164,17 +1166,10 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name, { handle_t *handle; int error, retries = 0; - int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb); + int credits = ext4_jbd2_credits_xattr(inode); retry: - /* - * In case of inline data, we may push out the data to a block, - * So reserve the journal space first. - */ - if (ext4_has_inline_data(inode)) - credits += ext4_writepage_trans_blocks(inode) + 1; - - handle = ext4_journal_start(inode, credits); + handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) { error = PTR_ERR(handle); } else { diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 69eda787a96a..aa25deb5c6cd 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -125,74 +125,6 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is); -extern int ext4_has_inline_data(struct inode *inode); -extern int ext4_get_inline_size(struct inode *inode); -extern int ext4_get_max_inline_size(struct inode *inode); -extern int ext4_find_inline_data_nolock(struct inode *inode); -extern void ext4_write_inline_data(struct inode *inode, - struct ext4_iloc *iloc, - void *buffer, loff_t pos, - unsigned int len); -extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, - unsigned int len); -extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, - unsigned int len); -extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); - -extern int ext4_readpage_inline(struct inode *inode, struct page *page); -extern int ext4_try_to_write_inline_data(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - unsigned flags, - struct page **pagep); -extern int ext4_write_inline_data_end(struct inode *inode, - loff_t pos, unsigned len, - unsigned copied, - struct page *page); -extern struct buffer_head * -ext4_journalled_write_inline_data(struct inode *inode, - unsigned len, - struct page *page); -extern int ext4_da_write_inline_data_begin(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - unsigned flags, - struct page **pagep, - void **fsdata); -extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, - unsigned len, unsigned copied, - struct page *page); -extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode); -extern int ext4_try_create_inline_dir(handle_t *handle, - struct inode *parent, - struct inode *inode); -extern int ext4_read_inline_dir(struct file *filp, - void *dirent, filldir_t filldir, - int *has_inline_data); -extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, - const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir, - int *has_inline_data); -extern int ext4_delete_inline_entry(handle_t *handle, - struct inode *dir, - struct ext4_dir_entry_2 *de_del, - struct buffer_head *bh, - int *has_inline_data); -extern int empty_inline_dir(struct inode *dir, int *has_inline_data); -extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, - struct ext4_dir_entry_2 **parent_de, - int *retval); -extern int ext4_inline_data_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, - int *has_inline); -extern int ext4_try_to_evict_inline_data(handle_t *handle, - struct inode *inode, - int needed); -extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline); - -extern int ext4_convert_inline_data(struct inode *inode); - #ifdef CONFIG_EXT4_FS_SECURITY extern int ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, const struct qstr *qstr); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index ff3c8439af87..2b6fc131e2ce 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -72,22 +72,22 @@ static int f2fs_write_meta_page(struct page *page, { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - int err; - wait_on_page_writeback(page); - - err = write_meta_page(sbi, page, wbc); - if (err) { + /* Should not write any meta pages, if any IO error was occurred */ + if (wbc->for_reclaim || + is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)) { + dec_page_count(sbi, F2FS_DIRTY_META); wbc->pages_skipped++; set_page_dirty(page); + return AOP_WRITEPAGE_ACTIVATE; } - dec_page_count(sbi, F2FS_DIRTY_META); + wait_on_page_writeback(page); - /* In this case, we should not unlock this page */ - if (err != AOP_WRITEPAGE_ACTIVATE) - unlock_page(page); - return err; + write_meta_page(sbi, page); + dec_page_count(sbi, F2FS_DIRTY_META); + unlock_page(page); + return 0; } static int f2fs_write_meta_pages(struct address_space *mapping, @@ -138,7 +138,10 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, BUG_ON(page->mapping != mapping); BUG_ON(!PageDirty(page)); clear_page_dirty_for_io(page); - f2fs_write_meta_page(page, &wbc); + if (f2fs_write_meta_page(page, &wbc)) { + unlock_page(page); + break; + } if (nwritten++ >= nr_to_write) break; } @@ -161,7 +164,6 @@ static int f2fs_set_meta_page_dirty(struct page *page) if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); inc_page_count(sbi, F2FS_DIRTY_META); - F2FS_SET_SB_DIRT(sbi); return 1; } return 0; @@ -216,19 +218,11 @@ retry: new->ino = ino; /* add new_oentry into list which is sorted by inode number */ - if (orphan) { - struct orphan_inode_entry *prev; - - /* get previous entry */ - prev = list_entry(orphan->list.prev, typeof(*prev), list); - if (&prev->list != head) - /* insert new orphan inode entry */ - list_add(&new->list, &prev->list); - else - list_add(&new->list, head); - } else { + if (orphan) + list_add(&new->list, this->prev); + else list_add_tail(&new->list, head); - } + sbi->n_orphans++; out: mutex_unlock(&sbi->orphan_inode_mutex); @@ -545,7 +539,7 @@ retry: /* * Freeze all the FS-operations for checkpoint. */ -void block_operations(struct f2fs_sb_info *sbi) +static void block_operations(struct f2fs_sb_info *sbi) { int t; struct writeback_control wbc = { @@ -717,27 +711,24 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) sbi->alloc_valid_block_count = 0; /* Here, we only have one bio having CP pack */ - if (is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) - sbi->sb->s_flags |= MS_RDONLY; - else - sync_meta_pages(sbi, META_FLUSH, LONG_MAX); + sync_meta_pages(sbi, META_FLUSH, LONG_MAX); - clear_prefree_segments(sbi); - F2FS_RESET_SB_DIRT(sbi); + if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { + clear_prefree_segments(sbi); + F2FS_RESET_SB_DIRT(sbi); + } } /* * We guarantee that this checkpoint procedure should not fail. */ -void write_checkpoint(struct f2fs_sb_info *sbi, bool blocked, bool is_umount) +void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long long ckpt_ver; - if (!blocked) { - mutex_lock(&sbi->cp_mutex); - block_operations(sbi); - } + mutex_lock(&sbi->cp_mutex); + block_operations(sbi); f2fs_submit_bio(sbi, DATA, true); f2fs_submit_bio(sbi, NODE, true); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index c8c37307b326..025b9e2f935d 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -183,10 +183,12 @@ static int stat_show(struct seq_file *s, void *v) mutex_lock(&f2fs_stat_mutex); list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) { + char devname[BDEVNAME_SIZE]; update_general_status(si->sbi); - seq_printf(s, "\n=====[ partition info. #%d ]=====\n", i++); + seq_printf(s, "\n=====[ partition info(%s). #%d ]=====\n", + bdevname(si->sbi->sb->s_bdev, devname), i++); seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 989980e16d0b..a1f38443ecee 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -265,7 +265,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, mutex_unlock_op(sbi, DENTRY_OPS); } -void init_dent_inode(struct dentry *dentry, struct page *ipage) +void init_dent_inode(const struct qstr *name, struct page *ipage) { struct f2fs_node *rn; @@ -274,20 +274,19 @@ void init_dent_inode(struct dentry *dentry, struct page *ipage) wait_on_page_writeback(ipage); - /* copy dentry info. to this inode page */ + /* copy name info. to this inode page */ rn = (struct f2fs_node *)page_address(ipage); - rn->i.i_namelen = cpu_to_le32(dentry->d_name.len); - memcpy(rn->i.i_name, dentry->d_name.name, dentry->d_name.len); + rn->i.i_namelen = cpu_to_le32(name->len); + memcpy(rn->i.i_name, name->name, name->len); set_page_dirty(ipage); } -static int init_inode_metadata(struct inode *inode, struct dentry *dentry) +static int init_inode_metadata(struct inode *inode, + struct inode *dir, const struct qstr *name) { - struct inode *dir = dentry->d_parent->d_inode; - if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { int err; - err = new_inode_page(inode, dentry); + err = new_inode_page(inode, name); if (err) return err; @@ -310,7 +309,7 @@ static int init_inode_metadata(struct inode *inode, struct dentry *dentry) if (IS_ERR(ipage)) return PTR_ERR(ipage); set_cold_node(inode, ipage); - init_dent_inode(dentry, ipage); + init_dent_inode(name, ipage); f2fs_put_page(ipage, 1); } if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { @@ -371,7 +370,7 @@ next: goto next; } -int f2fs_add_link(struct dentry *dentry, struct inode *inode) +int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode) { unsigned int bit_pos; unsigned int level; @@ -380,17 +379,15 @@ int f2fs_add_link(struct dentry *dentry, struct inode *inode) f2fs_hash_t dentry_hash; struct f2fs_dir_entry *de; unsigned int nbucket, nblock; - struct inode *dir = dentry->d_parent->d_inode; struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - const char *name = dentry->d_name.name; - size_t namelen = dentry->d_name.len; + size_t namelen = name->len; struct page *dentry_page = NULL; struct f2fs_dentry_block *dentry_blk = NULL; int slots = GET_DENTRY_SLOTS(namelen); int err = 0; int i; - dentry_hash = f2fs_dentry_hash(name, dentry->d_name.len); + dentry_hash = f2fs_dentry_hash(name->name, name->len); level = 0; current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { @@ -433,7 +430,7 @@ start: ++level; goto start; add_dentry: - err = init_inode_metadata(inode, dentry); + err = init_inode_metadata(inode, dir, name); if (err) goto fail; @@ -442,7 +439,7 @@ add_dentry: de = &dentry_blk->dentry[bit_pos]; de->hash_code = dentry_hash; de->name_len = cpu_to_le16(namelen); - memcpy(dentry_blk->filename[bit_pos], name, namelen); + memcpy(dentry_blk->filename[bit_pos], name->name, name->len); de->ino = cpu_to_le32(inode->i_ino); set_de_type(de, inode); for (i = 0; i < slots; i++) @@ -603,7 +600,7 @@ bool f2fs_empty_dir(struct inode *dir) static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir) { unsigned long pos = file->f_pos; - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); unsigned long npages = dir_blocks(inode); unsigned char *types = NULL; unsigned int bit_pos = 0, start_bit_pos = 0; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c8e2d751ef9c..cc2213afdcc7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -104,6 +104,20 @@ static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i) } /* + * ioctl commands + */ +#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS +#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +/* + * ioctl commands in 32 bit emulation + */ +#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#endif + +/* * For INODE and NODE manager */ #define XATTR_NODE_OFFSET (-1) /* @@ -141,7 +155,7 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ - unsigned long long data_version;/* lastes version of data for fsync */ + unsigned long long data_version;/* latest version of data for fsync */ atomic_t dirty_dents; /* # of dirty dentry pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ @@ -573,6 +587,14 @@ static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) return atomic_read(&sbi->nr_pages[count_type]); } +static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) +{ + unsigned int pages_per_sec = sbi->segs_per_sec * + (1 << sbi->log_blocks_per_seg); + return ((get_pages(sbi, block_type) + pages_per_sec - 1) + >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; +} + static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) { block_t ret; @@ -842,12 +864,12 @@ void f2fs_truncate(struct inode *); int f2fs_setattr(struct dentry *, struct iattr *); int truncate_hole(struct inode *, pgoff_t, pgoff_t); long f2fs_ioctl(struct file *, unsigned int, unsigned long); +long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); /* * inode.c */ void f2fs_set_inode_flags(struct inode *); -struct inode *f2fs_iget_nowait(struct super_block *, unsigned long); struct inode *f2fs_iget(struct super_block *, unsigned long); void update_inode(struct inode *, struct page *); int f2fs_write_inode(struct inode *, struct writeback_control *); @@ -867,12 +889,18 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); ino_t f2fs_inode_by_name(struct inode *, struct qstr *); void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, struct page *, struct inode *); -void init_dent_inode(struct dentry *, struct page *); -int f2fs_add_link(struct dentry *, struct inode *); +void init_dent_inode(const struct qstr *, struct page *); +int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *); void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *); int f2fs_make_empty(struct inode *, struct inode *); bool f2fs_empty_dir(struct inode *); +static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) +{ + return __f2fs_add_link(dentry->d_parent->d_inode, &dentry->d_name, + inode); +} + /* * super.c */ @@ -896,7 +924,7 @@ void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); int truncate_inode_blocks(struct inode *, pgoff_t); int remove_inode_page(struct inode *); -int new_inode_page(struct inode *, struct dentry *); +int new_inode_page(struct inode *, const struct qstr *); struct page *new_node_page(struct dnode_of_data *, unsigned int); void ra_node_page(struct f2fs_sb_info *, nid_t); struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); @@ -929,8 +957,7 @@ void allocate_new_segments(struct f2fs_sb_info *); struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); struct bio *f2fs_bio_alloc(struct block_device *, int); void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool sync); -int write_meta_page(struct f2fs_sb_info *, struct page *, - struct writeback_control *); +void write_meta_page(struct f2fs_sb_info *, struct page *); void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int, block_t, block_t *); void write_data_page(struct inode *, struct page *, struct dnode_of_data*, @@ -963,8 +990,7 @@ int get_valid_checkpoint(struct f2fs_sb_info *); void set_dirty_dir_page(struct inode *, struct page *); void remove_dirty_dir_inode(struct inode *); void sync_dirty_dir_inodes(struct f2fs_sb_info *); -void block_operations(struct f2fs_sb_info *); -void write_checkpoint(struct f2fs_sb_info *, bool, bool); +void write_checkpoint(struct f2fs_sb_info *, bool); void init_orphan_info(struct f2fs_sb_info *); int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 3191b52aafb0..958a46da19ae 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -15,6 +15,7 @@ #include <linux/writeback.h> #include <linux/falloc.h> #include <linux/types.h> +#include <linux/compat.h> #include <linux/uaccess.h> #include <linux/mount.h> @@ -28,7 +29,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct inode *inode = file_inode(vma->vm_file); struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); block_t old_blk_addr; struct dnode_of_data dn; @@ -157,11 +158,11 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) need_cp = true; - if (is_inode_flag_set(F2FS_I(inode), FI_NEED_CP)) + else if (is_inode_flag_set(F2FS_I(inode), FI_NEED_CP)) need_cp = true; - if (!space_for_roll_forward(sbi)) + else if (!space_for_roll_forward(sbi)) need_cp = true; - if (need_to_sync_dir(sbi, inode)) + else if (need_to_sync_dir(sbi, inode)) need_cp = true; if (need_cp) { @@ -298,8 +299,6 @@ void f2fs_truncate(struct inode *inode) inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); } - - f2fs_balance_fs(F2FS_SB(inode->i_sb)); } static int f2fs_getattr(struct vfsmount *mnt, @@ -356,6 +355,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_size != i_size_read(inode)) { truncate_setsize(inode, attr->ia_size); f2fs_truncate(inode); + f2fs_balance_fs(F2FS_SB(inode->i_sb)); } __setattr_copy(inode, attr); @@ -387,12 +387,17 @@ const struct inode_operations f2fs_file_inode_operations = { static void fill_zero(struct inode *inode, pgoff_t index, loff_t start, loff_t len) { + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *page; if (!len) return; + f2fs_balance_fs(sbi); + + mutex_lock_op(sbi, DATA_NEW); page = get_new_data_page(inode, index, false); + mutex_unlock_op(sbi, DATA_NEW); if (!IS_ERR(page)) { wait_on_page_writeback(page); @@ -539,7 +544,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, static long f2fs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); long ret; if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) @@ -572,7 +577,7 @@ static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = file_inode(filp); struct f2fs_inode_info *fi = F2FS_I(inode); unsigned int flags; int ret; @@ -630,6 +635,23 @@ out: } } +#ifdef CONFIG_COMPAT +long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case F2FS_IOC32_GETFLAGS: + cmd = F2FS_IOC_GETFLAGS; + break; + case F2FS_IOC32_SETFLAGS: + cmd = F2FS_IOC_SETFLAGS; + break; + default: + return -ENOIOCTLCMD; + } + return f2fs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif + const struct file_operations f2fs_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -641,6 +663,9 @@ const struct file_operations f2fs_file_operations = { .fsync = f2fs_sync_file, .fallocate = f2fs_fallocate, .unlocked_ioctl = f2fs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = f2fs_compat_ioctl, +#endif .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, }; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index c386910dacc5..94b8a0c48453 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -44,10 +44,10 @@ static int gc_thread_func(void *data) if (kthread_should_stop()) break; - f2fs_balance_fs(sbi); - - if (!test_opt(sbi, BG_GC)) + if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) { + wait_ms = GC_THREAD_MAX_SLEEP_TIME; continue; + } /* * [GC triggering condition] @@ -78,7 +78,8 @@ static int gc_thread_func(void *data) sbi->bg_gc++; - if (f2fs_gc(sbi) == GC_NONE) + /* if return value is not zero, no victim was selected */ + if (f2fs_gc(sbi)) wait_ms = GC_THREAD_NOGC_SLEEP_TIME; else if (wait_ms == GC_THREAD_NOGC_SLEEP_TIME) wait_ms = GC_THREAD_MAX_SLEEP_TIME; @@ -90,7 +91,10 @@ static int gc_thread_func(void *data) int start_gc_thread(struct f2fs_sb_info *sbi) { struct f2fs_gc_kthread *gc_th; + dev_t dev = sbi->sb->s_bdev->bd_dev; + if (!test_opt(sbi, BG_GC)) + return 0; gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); if (!gc_th) return -ENOMEM; @@ -98,9 +102,10 @@ int start_gc_thread(struct f2fs_sb_info *sbi) sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, - GC_THREAD_NAME); + "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(gc_th->f2fs_gc_task)) { kfree(gc_th); + sbi->gc_thread = NULL; return -ENOMEM; } return 0; @@ -141,6 +146,9 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, static unsigned int get_max_cost(struct f2fs_sb_info *sbi, struct victim_sel_policy *p) { + /* SSR allocates in a segment unit */ + if (p->alloc_mode == SSR) + return 1 << sbi->log_blocks_per_seg; if (p->gc_mode == GC_GREEDY) return (1 << sbi->log_blocks_per_seg) * p->ofs_unit; else if (p->gc_mode == GC_CB) @@ -356,7 +364,7 @@ static int check_valid_map(struct f2fs_sb_info *sbi, sentry = get_seg_entry(sbi, segno); ret = f2fs_test_bit(offset, sentry->cur_valid_map); mutex_unlock(&sit_i->sentry_lock); - return ret ? GC_OK : GC_NEXT; + return ret; } /* @@ -364,7 +372,7 @@ static int check_valid_map(struct f2fs_sb_info *sbi, * On validity, copy that node with cold status, otherwise (invalid node) * ignore that. */ -static int gc_node_segment(struct f2fs_sb_info *sbi, +static void gc_node_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, unsigned int segno, int gc_type) { bool initial = true; @@ -376,21 +384,12 @@ next_step: for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { nid_t nid = le32_to_cpu(entry->nid); struct page *node_page; - int err; - /* - * It makes sure that free segments are able to write - * all the dirty node pages before CP after this CP. - * So let's check the space of dirty node pages. - */ - if (should_do_checkpoint(sbi)) { - mutex_lock(&sbi->cp_mutex); - block_operations(sbi); - return GC_BLOCKED; - } + /* stop BG_GC if there is not enough free sections. */ + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) + return; - err = check_valid_map(sbi, segno, off); - if (err == GC_NEXT) + if (check_valid_map(sbi, segno, off) == 0) continue; if (initial) { @@ -420,7 +419,6 @@ next_step: }; sync_node_pages(sbi, 0, &wbc); } - return GC_DONE; } /* @@ -463,13 +461,13 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, node_page = get_node_page(sbi, nid); if (IS_ERR(node_page)) - return GC_NEXT; + return 0; get_node_info(sbi, nid, dni); if (sum->version != dni->version) { f2fs_put_page(node_page, 1); - return GC_NEXT; + return 0; } *nofs = ofs_of_node(node_page); @@ -477,8 +475,8 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, f2fs_put_page(node_page, 1); if (source_blkaddr != blkaddr) - return GC_NEXT; - return GC_OK; + return 0; + return 1; } static void move_data_page(struct inode *inode, struct page *page, int gc_type) @@ -519,13 +517,13 @@ out: * If the parent node is not valid or the data block address is different, * the victim data block is ignored. */ -static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, +static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, struct list_head *ilist, unsigned int segno, int gc_type) { struct super_block *sb = sbi->sb; struct f2fs_summary *entry; block_t start_addr; - int err, off; + int off; int phase = 0; start_addr = START_BLOCK(sbi, segno); @@ -539,20 +537,11 @@ next_step: unsigned int ofs_in_node, nofs; block_t start_bidx; - /* - * It makes sure that free segments are able to write - * all the dirty node pages before CP after this CP. - * So let's check the space of dirty node pages. - */ - if (should_do_checkpoint(sbi)) { - mutex_lock(&sbi->cp_mutex); - block_operations(sbi); - err = GC_BLOCKED; - goto stop; - } + /* stop BG_GC if there is not enough free sections. */ + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) + return; - err = check_valid_map(sbi, segno, off); - if (err == GC_NEXT) + if (check_valid_map(sbi, segno, off) == 0) continue; if (phase == 0) { @@ -561,8 +550,7 @@ next_step: } /* Get an inode by ino with checking validity */ - err = check_dnode(sbi, entry, &dni, start_addr + off, &nofs); - if (err == GC_NEXT) + if (check_dnode(sbi, entry, &dni, start_addr + off, &nofs) == 0) continue; if (phase == 1) { @@ -574,7 +562,7 @@ next_step: ofs_in_node = le16_to_cpu(entry->ofs_in_node); if (phase == 2) { - inode = f2fs_iget_nowait(sb, dni.ino); + inode = f2fs_iget(sb, dni.ino); if (IS_ERR(inode)) continue; @@ -602,11 +590,9 @@ next_iput: } if (++phase < 4) goto next_step; - err = GC_DONE; -stop: + if (gc_type == FG_GC) f2fs_submit_bio(sbi, DATA, true); - return err; } static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, @@ -620,17 +606,16 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, return ret; } -static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, +static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, struct list_head *ilist, int gc_type) { struct page *sum_page; struct f2fs_summary_block *sum; - int ret = GC_DONE; /* read segment summary of victim */ sum_page = get_sum_page(sbi, segno); if (IS_ERR(sum_page)) - return GC_ERROR; + return; /* * CP needs to lock sum_page. In this time, we don't need @@ -642,17 +627,16 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, switch (GET_SUM_TYPE((&sum->footer))) { case SUM_TYPE_NODE: - ret = gc_node_segment(sbi, sum->entries, segno, gc_type); + gc_node_segment(sbi, sum->entries, segno, gc_type); break; case SUM_TYPE_DATA: - ret = gc_data_segment(sbi, sum->entries, ilist, segno, gc_type); + gc_data_segment(sbi, sum->entries, ilist, segno, gc_type); break; } stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer))); stat_inc_call_count(sbi->stat_info); f2fs_put_page(sum_page, 0); - return ret; } int f2fs_gc(struct f2fs_sb_info *sbi) @@ -660,40 +644,38 @@ int f2fs_gc(struct f2fs_sb_info *sbi) struct list_head ilist; unsigned int segno, i; int gc_type = BG_GC; - int gc_status = GC_NONE; + int nfree = 0; + int ret = -1; INIT_LIST_HEAD(&ilist); gc_more: if (!(sbi->sb->s_flags & MS_ACTIVE)) goto stop; - if (has_not_enough_free_secs(sbi)) + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) gc_type = FG_GC; if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) goto stop; + ret = 0; - for (i = 0; i < sbi->segs_per_sec; i++) { - /* - * do_garbage_collect will give us three gc_status: - * GC_ERROR, GC_DONE, and GC_BLOCKED. - * If GC is finished uncleanly, we have to return - * the victim to dirty segment list. - */ - gc_status = do_garbage_collect(sbi, segno + i, &ilist, gc_type); - if (gc_status != GC_DONE) - break; - } - if (has_not_enough_free_secs(sbi)) { - write_checkpoint(sbi, (gc_status == GC_BLOCKED), false); - if (has_not_enough_free_secs(sbi)) - goto gc_more; - } + for (i = 0; i < sbi->segs_per_sec; i++) + do_garbage_collect(sbi, segno + i, &ilist, gc_type); + + if (gc_type == FG_GC && + get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0) + nfree++; + + if (has_not_enough_free_secs(sbi, nfree)) + goto gc_more; + + if (gc_type == FG_GC) + write_checkpoint(sbi, false); stop: mutex_unlock(&sbi->gc_mutex); put_gc_inode(&ilist); - return gc_status; + return ret; } void build_gc_manager(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index b026d9354ccd..30b2db003acd 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -8,7 +8,6 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -#define GC_THREAD_NAME "f2fs_gc_task" #define GC_THREAD_MIN_WB_PAGES 1 /* * a threshold to determine * whether IO subsystem is idle @@ -23,15 +22,6 @@ /* Search max. number of dirty segments to select a victim segment */ #define MAX_VICTIM_SEARCH 20 -enum { - GC_NONE = 0, - GC_ERROR, - GC_OK, - GC_NEXT, - GC_BLOCKED, - GC_DONE, -}; - struct f2fs_gc_kthread { struct task_struct *f2fs_gc_task; wait_queue_head_t gc_wait_queue_head; @@ -104,14 +94,3 @@ static inline int is_idle(struct f2fs_sb_info *sbi) struct request_list *rl = &q->root_rl; return !(rl->count[BLK_RW_SYNC]) && !(rl->count[BLK_RW_ASYNC]); } - -static inline bool should_do_checkpoint(struct f2fs_sb_info *sbi) -{ - unsigned int pages_per_sec = sbi->segs_per_sec * - (1 << sbi->log_blocks_per_seg); - int node_secs = ((get_pages(sbi, F2FS_DIRTY_NODES) + pages_per_sec - 1) - >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; - int dent_secs = ((get_pages(sbi, F2FS_DIRTY_DENTS) + pages_per_sec - 1) - >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; - return free_sections(sbi) <= (node_secs + 2 * dent_secs + 2); -} diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 794241777322..ddae412d30c8 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -16,11 +16,6 @@ #include "f2fs.h" #include "node.h" -struct f2fs_iget_args { - u64 ino; - int on_free; -}; - void f2fs_set_inode_flags(struct inode *inode) { unsigned int flags = F2FS_I(inode)->i_flags; @@ -40,34 +35,6 @@ void f2fs_set_inode_flags(struct inode *inode) inode->i_flags |= S_DIRSYNC; } -static int f2fs_iget_test(struct inode *inode, void *data) -{ - struct f2fs_iget_args *args = data; - - if (inode->i_ino != args->ino) - return 0; - if (inode->i_state & (I_FREEING | I_WILL_FREE)) { - args->on_free = 1; - return 0; - } - return 1; -} - -struct inode *f2fs_iget_nowait(struct super_block *sb, unsigned long ino) -{ - struct f2fs_iget_args args = { - .ino = ino, - .on_free = 0 - }; - struct inode *inode = ilookup5(sb, ino, f2fs_iget_test, &args); - - if (inode) - return inode; - if (!args.on_free) - return f2fs_iget(sb, ino); - return ERR_PTR(-ENOENT); -} - static int do_read_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); @@ -100,6 +67,10 @@ static int do_read_inode(struct inode *inode) inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); inode->i_generation = le32_to_cpu(ri->i_generation); + if (ri->i_addr[0]) + inode->i_rdev = old_decode_dev(le32_to_cpu(ri->i_addr[0])); + else + inode->i_rdev = new_decode_dev(le32_to_cpu(ri->i_addr[1])); fi->i_current_depth = le32_to_cpu(ri->i_current_depth); fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid); @@ -203,6 +174,20 @@ void update_inode(struct inode *inode, struct page *node_page) ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags); ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino); ri->i_generation = cpu_to_le32(inode->i_generation); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + ri->i_addr[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + ri->i_addr[1] = 0; + } else { + ri->i_addr[0] = 0; + ri->i_addr[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + ri->i_addr[2] = 0; + } + } + set_cold_node(inode, node_page); set_page_dirty(node_page); } @@ -260,6 +245,7 @@ void f2fs_evict_inode(struct inode *inode) if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; + sb_start_intwrite(inode->i_sb); set_inode_flag(F2FS_I(inode), FI_NO_ALLOC); i_size_write(inode, 0); @@ -267,6 +253,7 @@ void f2fs_evict_inode(struct inode *inode) f2fs_truncate(inode); remove_inode_page(inode); + sb_end_intwrite(inode->i_sb); no_delete: clear_inode(inode); } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 9bda63c9c166..e275218904ed 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -104,7 +104,7 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) f2fs_put_page(page, 1); continue; } - page_cache_release(page); + f2fs_put_page(page, 0); } } @@ -660,7 +660,7 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); int err = 0, cont = 1; int level, offset[4], noffset[4]; - unsigned int nofs; + unsigned int nofs = 0; struct f2fs_node *rn; struct dnode_of_data dn; struct page *page; @@ -780,7 +780,7 @@ int remove_inode_page(struct inode *inode) return 0; } -int new_inode_page(struct inode *inode, struct dentry *dentry) +int new_inode_page(struct inode *inode, const struct qstr *name) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *page; @@ -790,7 +790,7 @@ int new_inode_page(struct inode *inode, struct dentry *dentry) set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); mutex_lock_op(sbi, NODE_NEW); page = new_node_page(&dn, 0); - init_dent_inode(dentry, page); + init_dent_inode(name, page); mutex_unlock_op(sbi, NODE_NEW); if (IS_ERR(page)) return PTR_ERR(page); @@ -874,15 +874,11 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) return; if (read_node_page(apage, READA)) - goto unlock_out; + unlock_page(apage); - page_cache_release(apage); - return; - -unlock_out: - unlock_page(apage); release_out: - page_cache_release(apage); + f2fs_put_page(apage, 0); + return; } struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) @@ -1139,7 +1135,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, /* First check balancing cached NAT entries */ if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK)) { - write_checkpoint(sbi, false, false); + write_checkpoint(sbi, false); return 0; } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index f42e4060b399..b235215ac138 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -42,7 +42,7 @@ static int recover_dentry(struct page *ipage, struct inode *inode) { struct f2fs_node *raw_node = (struct f2fs_node *)kmap(ipage); struct f2fs_inode *raw_inode = &(raw_node->i); - struct dentry dent, parent; + struct qstr name; struct f2fs_dir_entry *de; struct page *page; struct inode *dir; @@ -57,17 +57,15 @@ static int recover_dentry(struct page *ipage, struct inode *inode) goto out; } - parent.d_inode = dir; - dent.d_parent = &parent; - dent.d_name.len = le32_to_cpu(raw_inode->i_namelen); - dent.d_name.name = raw_inode->i_name; + name.len = le32_to_cpu(raw_inode->i_namelen); + name.name = raw_inode->i_name; - de = f2fs_find_entry(dir, &dent.d_name, &page); + de = f2fs_find_entry(dir, &name, &page); if (de) { kunmap(page); f2fs_put_page(page, 0); } else { - err = f2fs_add_link(&dent, inode); + err = __f2fs_add_link(dir, &name, inode); } iput(dir); out: @@ -226,7 +224,7 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi, f2fs_put_page(node_page, 1); /* Deallocate previous index in the node page */ - inode = f2fs_iget_nowait(sbi->sb, ino); + inode = f2fs_iget(sbi->sb, ino); if (IS_ERR(inode)) return; @@ -373,5 +371,5 @@ void recover_fsync_data(struct f2fs_sb_info *sbi) out: destroy_fsync_dnodes(sbi, &inode_list); kmem_cache_destroy(fsync_entry_slab); - write_checkpoint(sbi, false, false); + write_checkpoint(sbi, false); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4b0099066582..777f17e496e6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -29,7 +29,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi) * We should do GC or end up with checkpoint, if there are so many dirty * dir/node pages without enough free segments. */ - if (has_not_enough_free_secs(sbi)) { + if (has_not_enough_free_secs(sbi, 0)) { mutex_lock(&sbi->gc_mutex); f2fs_gc(sbi); } @@ -308,7 +308,7 @@ static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi, * If there is not enough reserved sections, * we should not reuse prefree segments. */ - if (has_not_enough_free_secs(sbi)) + if (has_not_enough_free_secs(sbi, 0)) return NULL_SEGNO; /* @@ -536,6 +536,23 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) } } +static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; + + if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0)) + return v_ops->get_victim(sbi, + &(curseg)->next_segno, BG_GC, type, SSR); + + /* For data segments, let's do SSR more intensively */ + for (; type >= CURSEG_HOT_DATA; type--) + if (v_ops->get_victim(sbi, &(curseg)->next_segno, + BG_GC, type, SSR)) + return 1; + return 0; +} + /* * flush out current segment and replace it with new segment * This function should be returned with success, otherwise BUG @@ -600,6 +617,7 @@ static void f2fs_end_io_write(struct bio *bio, int err) if (page->mapping) set_bit(AS_EIO, &page->mapping->flags); set_ckpt_flags(p->sbi->ckpt, CP_ERROR_FLAG); + p->sbi->sb->s_flags |= MS_RDONLY; } end_page_writeback(page); dec_page_count(p->sbi, F2FS_WRITEBACK); @@ -815,15 +833,10 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, mutex_unlock(&curseg->curseg_mutex); } -int write_meta_page(struct f2fs_sb_info *sbi, struct page *page, - struct writeback_control *wbc) +void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) { - if (wbc->for_reclaim) - return AOP_WRITEPAGE_ACTIVATE; - set_page_writeback(page); submit_write_page(sbi, page, page->index, META); - return 0; } void write_node_page(struct f2fs_sb_info *sbi, struct page *page, diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 66a288a52fd3..552dadbb2327 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -450,29 +450,16 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi) return (free_sections(sbi) < overprovision_sections(sbi)); } -static inline int get_ssr_segment(struct f2fs_sb_info *sbi, int type) +static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) { - struct curseg_info *curseg = CURSEG_I(sbi, type); - return DIRTY_I(sbi)->v_ops->get_victim(sbi, - &(curseg)->next_segno, BG_GC, type, SSR); -} - -static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi) -{ - unsigned int pages_per_sec = (1 << sbi->log_blocks_per_seg) * - sbi->segs_per_sec; - int node_secs = ((get_pages(sbi, F2FS_DIRTY_NODES) + pages_per_sec - 1) - >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; - int dent_secs = ((get_pages(sbi, F2FS_DIRTY_DENTS) + pages_per_sec - 1) - >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; + int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); + int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); if (sbi->por_doing) return false; - if (free_sections(sbi) <= (node_secs + 2 * dent_secs + - reserved_sections(sbi))) - return true; - return false; + return ((free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + + reserved_sections(sbi))); } static inline int utilization(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 37fad04c8669..fea6e582a2ed 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -112,7 +112,7 @@ static void f2fs_put_super(struct super_block *sb) f2fs_destroy_stats(sbi); stop_gc_thread(sbi); - write_checkpoint(sbi, false, true); + write_checkpoint(sbi, true); iput(sbi->node_inode); iput(sbi->meta_inode); @@ -136,13 +136,29 @@ int f2fs_sync_fs(struct super_block *sb, int sync) return 0; if (sync) - write_checkpoint(sbi, false, false); + write_checkpoint(sbi, false); else f2fs_balance_fs(sbi); return 0; } +static int f2fs_freeze(struct super_block *sb) +{ + int err; + + if (sb->s_flags & MS_RDONLY) + return 0; + + err = f2fs_sync_fs(sb, 1); + return err; +} + +static int f2fs_unfreeze(struct super_block *sb) +{ + return 0; +} + static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; @@ -198,7 +214,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",noacl"); #endif if (test_opt(sbi, DISABLE_EXT_IDENTIFY)) - seq_puts(seq, ",disable_ext_indentify"); + seq_puts(seq, ",disable_ext_identify"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); @@ -213,6 +229,8 @@ static struct super_operations f2fs_sops = { .evict_inode = f2fs_evict_inode, .put_super = f2fs_put_super, .sync_fs = f2fs_sync_fs, + .freeze_fs = f2fs_freeze, + .unfreeze_fs = f2fs_unfreeze, .statfs = f2fs_statfs, }; @@ -366,14 +384,23 @@ static int sanity_check_raw_super(struct super_block *sb, return 1; } + /* Currently, support only 4KB page cache size */ + if (F2FS_BLKSIZE != PAGE_CACHE_SIZE) { + f2fs_msg(sb, KERN_INFO, + "Invalid page_cache_size (%lu), supports only 4KB\n", + PAGE_CACHE_SIZE); + return 1; + } + /* Currently, support only 4KB block size */ blocksize = 1 << le32_to_cpu(raw_super->log_blocksize); - if (blocksize != PAGE_CACHE_SIZE) { + if (blocksize != F2FS_BLKSIZE) { f2fs_msg(sb, KERN_INFO, "Invalid blocksize (%u), supports only 4KB\n", blocksize); return 1; } + if (le32_to_cpu(raw_super->log_sectorsize) != F2FS_LOG_SECTOR_SIZE) { f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize"); @@ -387,10 +414,11 @@ static int sanity_check_raw_super(struct super_block *sb, return 0; } -static int sanity_check_ckpt(struct f2fs_super_block *raw_super, - struct f2fs_checkpoint *ckpt) +static int sanity_check_ckpt(struct f2fs_sb_info *sbi) { unsigned int total, fsmeta; + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -401,6 +429,11 @@ static int sanity_check_ckpt(struct f2fs_super_block *raw_super, if (fsmeta >= total) return 1; + + if (is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { + f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); + return 1; + } return 0; } @@ -429,6 +462,32 @@ static void init_sb_info(struct f2fs_sb_info *sbi) atomic_set(&sbi->nr_pages[i], 0); } +static int validate_superblock(struct super_block *sb, + struct f2fs_super_block **raw_super, + struct buffer_head **raw_super_buf, sector_t block) +{ + const char *super = (block == 0 ? "first" : "second"); + + /* read f2fs raw super block */ + *raw_super_buf = sb_bread(sb, block); + if (!*raw_super_buf) { + f2fs_msg(sb, KERN_ERR, "unable to read %s superblock", + super); + return 1; + } + + *raw_super = (struct f2fs_super_block *) + ((char *)(*raw_super_buf)->b_data + F2FS_SUPER_OFFSET); + + /* sanity checking of raw super */ + if (!sanity_check_raw_super(sb, *raw_super)) + return 0; + + f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem " + "in %s superblock", super); + return 1; +} + static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; @@ -449,16 +508,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - /* read f2fs raw super block */ - raw_super_buf = sb_bread(sb, 0); - if (!raw_super_buf) { - err = -EIO; - f2fs_msg(sb, KERN_ERR, "unable to read superblock"); - goto free_sbi; + if (validate_superblock(sb, &raw_super, &raw_super_buf, 0)) { + brelse(raw_super_buf); + if (validate_superblock(sb, &raw_super, &raw_super_buf, 1)) + goto free_sb_buf; } - raw_super = (struct f2fs_super_block *) - ((char *)raw_super_buf->b_data + F2FS_SUPER_OFFSET); - /* init some FS parameters */ sbi->active_logs = NR_CURSEG_TYPE; @@ -474,12 +528,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (parse_options(sb, sbi, (char *)data)) goto free_sb_buf; - /* sanity checking of raw super */ - if (sanity_check_raw_super(sb, raw_super)) { - f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem"); - goto free_sb_buf; - } - sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); sb->s_max_links = F2FS_LINK_MAX; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); @@ -525,7 +573,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) /* sanity checking of checkpoint */ err = -EINVAL; - if (sanity_check_ckpt(raw_super, sbi->ckpt)) { + if (sanity_check_ckpt(sbi)) { f2fs_msg(sb, KERN_ERR, "Invalid F2FS checkpoint"); goto free_cp; } @@ -639,6 +687,7 @@ static struct file_system_type f2fs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("f2fs"); static int __init init_inodecache(void) { diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 58bf744dbf39..165012ef363a 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -698,7 +698,7 @@ out: static int fat_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); return __fat_readdir(inode, filp, dirent, filldir, 0, 0); } @@ -779,7 +779,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *filp, static long fat_dir_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg; int short_only, both; @@ -819,7 +819,7 @@ FAT_IOCTL_FILLDIR_FUNC(fat_compat_ioctl_filldir, compat_dirent) static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd, unsigned long arg) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct compat_dirent __user *d1 = compat_ptr(arg); int short_only, both; diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 12701a567752..e9cc3f0d58e2 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -95,6 +95,8 @@ struct msdos_sb_info { spinlock_t dir_hash_lock; struct hlist_head dir_hashtable[FAT_HASH_SIZE]; + + unsigned int dirty; /* fs state before mount */ }; #define FAT_CACHE_VALID 0 /* special case for valid cache */ diff --git a/fs/fat/file.c b/fs/fat/file.c index a62e0ecbe2db..3978f8ca1823 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -32,7 +32,7 @@ static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr) static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); int is_dir = S_ISDIR(inode->i_mode); u32 attr, oldattr; @@ -116,7 +116,7 @@ out: long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); u32 __user *user_attr = (u32 __user *)arg; switch (cmd) { diff --git a/fs/fat/inode.c b/fs/fat/inode.c index f8f491677a4a..acf6e479b443 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -341,12 +341,11 @@ struct inode *fat_iget(struct super_block *sb, loff_t i_pos) { struct msdos_sb_info *sbi = MSDOS_SB(sb); struct hlist_head *head = sbi->inode_hashtable + fat_hash(i_pos); - struct hlist_node *_p; struct msdos_inode_info *i; struct inode *inode = NULL; spin_lock(&sbi->inode_hash_lock); - hlist_for_each_entry(i, _p, head, i_fat_hash) { + hlist_for_each_entry(i, head, i_fat_hash) { BUG_ON(i->vfs_inode.i_sb != sb); if (i->i_pos != i_pos) continue; @@ -488,10 +487,59 @@ static void fat_evict_inode(struct inode *inode) fat_detach(inode); } +static void fat_set_state(struct super_block *sb, + unsigned int set, unsigned int force) +{ + struct buffer_head *bh; + struct fat_boot_sector *b; + struct msdos_sb_info *sbi = sb->s_fs_info; + + /* do not change any thing if mounted read only */ + if ((sb->s_flags & MS_RDONLY) && !force) + return; + + /* do not change state if fs was dirty */ + if (sbi->dirty) { + /* warn only on set (mount). */ + if (set) + fat_msg(sb, KERN_WARNING, "Volume was not properly " + "unmounted. Some data may be corrupt. " + "Please run fsck."); + return; + } + + bh = sb_bread(sb, 0); + if (bh == NULL) { + fat_msg(sb, KERN_ERR, "unable to read boot sector " + "to mark fs as dirty"); + return; + } + + b = (struct fat_boot_sector *) bh->b_data; + + if (sbi->fat_bits == 32) { + if (set) + b->fat32.state |= FAT_STATE_DIRTY; + else + b->fat32.state &= ~FAT_STATE_DIRTY; + } else /* fat 16 and 12 */ { + if (set) + b->fat16.state |= FAT_STATE_DIRTY; + else + b->fat16.state &= ~FAT_STATE_DIRTY; + } + + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); +} + static void fat_put_super(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); + fat_set_state(sb, 0, 0); + iput(sbi->fsinfo_inode); iput(sbi->fat_inode); @@ -566,8 +614,18 @@ static void __exit fat_destroy_inodecache(void) static int fat_remount(struct super_block *sb, int *flags, char *data) { + int new_rdonly; struct msdos_sb_info *sbi = MSDOS_SB(sb); *flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME); + + /* make sure we update state on remount. */ + new_rdonly = *flags & MS_RDONLY; + if (new_rdonly != (sb->s_flags & MS_RDONLY)) { + if (new_rdonly) + fat_set_state(sb, 0, 0); + else + fat_set_state(sb, 1, 1); + } return 0; } @@ -1298,17 +1356,17 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, sbi->prev_free = FAT_START_ENT; sb->s_maxbytes = 0xffffffff; - if (!sbi->fat_length && b->fat32_length) { + if (!sbi->fat_length && b->fat32.length) { struct fat_boot_fsinfo *fsinfo; struct buffer_head *fsinfo_bh; /* Must be FAT32 */ sbi->fat_bits = 32; - sbi->fat_length = le32_to_cpu(b->fat32_length); - sbi->root_cluster = le32_to_cpu(b->root_cluster); + sbi->fat_length = le32_to_cpu(b->fat32.length); + sbi->root_cluster = le32_to_cpu(b->fat32.root_cluster); /* MC - if info_sector is 0, don't multiply by 0 */ - sbi->fsinfo_sector = le16_to_cpu(b->info_sector); + sbi->fsinfo_sector = le16_to_cpu(b->fat32.info_sector); if (sbi->fsinfo_sector == 0) sbi->fsinfo_sector = 1; @@ -1362,6 +1420,12 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, if (sbi->fat_bits != 32) sbi->fat_bits = (total_clusters > MAX_FAT12) ? 16 : 12; + /* some OSes set FAT_STATE_DIRTY and clean it on unmount. */ + if (sbi->fat_bits == 32) + sbi->dirty = b->fat32.state & FAT_STATE_DIRTY; + else /* fat 16 or 12 */ + sbi->dirty = b->fat16.state & FAT_STATE_DIRTY; + /* check that FAT table does not overflow */ fat_clusters = sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits; total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT); @@ -1456,6 +1520,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, "the device does not support discard"); } + fat_set_state(sb, 1, 0); return 0; out_invalid: diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index e2cfda94a28d..081b759cff83 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -668,6 +668,7 @@ static struct file_system_type msdos_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("msdos"); static int __init init_msdos_fs(void) { diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index ac959d655e7d..2da952036a3d 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -1073,6 +1073,7 @@ static struct file_system_type vfat_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("vfat"); static int __init init_vfat_fs(void) { diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c index ef4b5faba87b..499c10438ca2 100644 --- a/fs/fat/nfs.c +++ b/fs/fat/nfs.c @@ -21,13 +21,12 @@ static struct inode *fat_dget(struct super_block *sb, int i_logstart) { struct msdos_sb_info *sbi = MSDOS_SB(sb); struct hlist_head *head; - struct hlist_node *_p; struct msdos_inode_info *i; struct inode *inode = NULL; head = sbi->dir_hashtable + fat_dir_hash(i_logstart); spin_lock(&sbi->dir_hash_lock); - hlist_for_each_entry(i, _p, head, i_dir_hash) { + hlist_for_each_entry(i, head, i_dir_hash) { BUG_ON(i->vfs_inode.i_sb != sb); if (i->i_logstart != i_logstart) continue; diff --git a/fs/fcntl.c b/fs/fcntl.c index 71a600a19f06..6599222536eb 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -30,7 +30,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg) { - struct inode * inode = filp->f_path.dentry->d_inode; + struct inode * inode = file_inode(filp); int error = 0; /* diff --git a/fs/file.c b/fs/file.c index 2b3570b7caeb..3906d9577a18 100644 --- a/fs/file.c +++ b/fs/file.c @@ -516,7 +516,7 @@ struct files_struct init_files = { .close_on_exec = init_files.close_on_exec_init, .open_fds = init_files.open_fds_init, }, - .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), + .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), }; /* diff --git a/fs/file_table.c b/fs/file_table.c index de9e9653d611..cd4d87a82951 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -94,8 +94,8 @@ int proc_nr_files(ctl_table *table, int write, #endif /* Find an unused file structure and return a pointer to it. - * Returns NULL, if there are no more free file structures or - * we run out of memory. + * Returns an error pointer if some error happend e.g. we over file + * structures limit, run out of memory or operation is not permitted. * * Be very careful using this. You are responsible for * getting write access to any mount that you might assign @@ -107,7 +107,8 @@ struct file *get_empty_filp(void) { const struct cred *cred = current_cred(); static long old_max; - struct file * f; + struct file *f; + int error; /* * Privileged users can go above max_files @@ -122,13 +123,16 @@ struct file *get_empty_filp(void) } f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); - if (f == NULL) - goto fail; + if (unlikely(!f)) + return ERR_PTR(-ENOMEM); percpu_counter_inc(&nr_files); f->f_cred = get_cred(cred); - if (security_file_alloc(f)) - goto fail_sec; + error = security_file_alloc(f); + if (unlikely(error)) { + file_free(f); + return ERR_PTR(error); + } INIT_LIST_HEAD(&f->f_u.fu_list); atomic_long_set(&f->f_count, 1); @@ -144,12 +148,7 @@ over: pr_info("VFS: file-max limit %lu reached\n", get_max_files()); old_max = get_nr_files(); } - goto fail; - -fail_sec: - file_free(f); -fail: - return NULL; + return ERR_PTR(-ENFILE); } /** @@ -173,10 +172,11 @@ struct file *alloc_file(struct path *path, fmode_t mode, struct file *file; file = get_empty_filp(); - if (!file) - return NULL; + if (IS_ERR(file)) + return file; file->f_path = *path; + file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; file->f_mode = mode; file->f_op = fop; @@ -259,6 +259,7 @@ static void __fput(struct file *file) drop_file_write_access(file); file->f_path.dentry = NULL; file->f_path.mnt = NULL; + file->f_inode = NULL; file_free(file); dput(dentry); mntput(mnt); @@ -447,7 +448,7 @@ void mark_files_ro(struct super_block *sb) lg_global_lock(&files_lglock); do_file_list_for_each_entry(sb, f) { - if (!S_ISREG(f->f_path.dentry->d_inode->i_mode)) + if (!S_ISREG(file_inode(f)->i_mode)) continue; if (!file_count(f)) continue; diff --git a/fs/filesystems.c b/fs/filesystems.c index da165f6adcbf..92567d95ba6a 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -273,7 +273,7 @@ struct file_system_type *get_fs_type(const char *name) int len = dot ? dot - name : strlen(name); fs = __get_fs_type(name, len); - if (!fs && (request_module("%.*s", len, name) == 0)) + if (!fs && (request_module("fs-%.*s", len, name) == 0)) fs = __get_fs_type(name, len); if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) { diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index bd447e88f208..664b07a53870 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -237,7 +237,7 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, unsigned int flags) static int vxfs_readdir(struct file *fp, void *retp, filldir_t filler) { - struct inode *ip = fp->f_path.dentry->d_inode; + struct inode *ip = file_inode(fp); struct super_block *sbp = ip->i_sb; u_long bsize = sbp->s_blocksize; u_long page, npages, block, pblocks, nblocks, offset; diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index fed2c8afb3a9..e37eb274e492 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -52,7 +52,6 @@ MODULE_AUTHOR("Christoph Hellwig"); MODULE_DESCRIPTION("Veritas Filesystem (VxFS) driver"); MODULE_LICENSE("Dual BSD/GPL"); -MODULE_ALIAS("vxfs"); /* makes mount -t vxfs autoload the module */ static void vxfs_put_super(struct super_block *); @@ -258,6 +257,8 @@ static struct file_system_type vxfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("vxfs"); /* makes mount -t vxfs autoload the module */ +MODULE_ALIAS("vxfs"); static int __init vxfs_init(void) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 310972b72a66..21f46fb3a101 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -318,8 +318,14 @@ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) static int write_inode(struct inode *inode, struct writeback_control *wbc) { - if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) - return inode->i_sb->s_op->write_inode(inode, wbc); + int ret; + + if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) { + trace_writeback_write_inode_start(inode, wbc); + ret = inode->i_sb->s_op->write_inode(inode, wbc); + trace_writeback_write_inode(inode, wbc); + return ret; + } return 0; } @@ -450,6 +456,8 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) WARN_ON(!(inode->i_state & I_SYNC)); + trace_writeback_single_inode_start(inode, wbc, nr_to_write); + ret = do_writepages(mapping, wbc); /* @@ -1150,8 +1158,12 @@ void __mark_inode_dirty(struct inode *inode, int flags) * dirty the inode itself */ if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + trace_writeback_dirty_inode_start(inode, flags); + if (sb->s_op->dirty_inode) sb->s_op->dirty_inode(inode, flags); + + trace_writeback_dirty_inode(inode, flags); } /* @@ -1332,47 +1344,43 @@ void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) EXPORT_SYMBOL(writeback_inodes_sb); /** - * writeback_inodes_sb_if_idle - start writeback if none underway + * try_to_writeback_inodes_sb_nr - try to start writeback if none underway * @sb: the superblock - * @reason: reason why some writeback work was initiated + * @nr: the number of pages to write + * @reason: the reason of writeback * - * Invoke writeback_inodes_sb if no writeback is currently underway. + * Invoke writeback_inodes_sb_nr if no writeback is currently underway. * Returns 1 if writeback was started, 0 if not. */ -int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason) +int try_to_writeback_inodes_sb_nr(struct super_block *sb, + unsigned long nr, + enum wb_reason reason) { - if (!writeback_in_progress(sb->s_bdi)) { - down_read(&sb->s_umount); - writeback_inodes_sb(sb, reason); - up_read(&sb->s_umount); + if (writeback_in_progress(sb->s_bdi)) return 1; - } else + + if (!down_read_trylock(&sb->s_umount)) return 0; + + writeback_inodes_sb_nr(sb, nr, reason); + up_read(&sb->s_umount); + return 1; } -EXPORT_SYMBOL(writeback_inodes_sb_if_idle); +EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr); /** - * writeback_inodes_sb_nr_if_idle - start writeback if none underway + * try_to_writeback_inodes_sb - try to start writeback if none underway * @sb: the superblock - * @nr: the number of pages to write * @reason: reason why some writeback work was initiated * - * Invoke writeback_inodes_sb if no writeback is currently underway. + * Implement by try_to_writeback_inodes_sb_nr() * Returns 1 if writeback was started, 0 if not. */ -int writeback_inodes_sb_nr_if_idle(struct super_block *sb, - unsigned long nr, - enum wb_reason reason) +int try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) { - if (!writeback_in_progress(sb->s_bdi)) { - down_read(&sb->s_umount); - writeback_inodes_sb_nr(sb, nr, reason); - up_read(&sb->s_umount); - return 1; - } else - return 0; + return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); } -EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle); +EXPORT_SYMBOL(try_to_writeback_inodes_sb); /** * sync_inodes_sb - sync sb inode pages diff --git a/fs/fs_struct.c b/fs/fs_struct.c index fe6ca583bbc0..d8ac61d0c932 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -10,7 +10,7 @@ * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. * It can block. */ -void set_fs_root(struct fs_struct *fs, struct path *path) +void set_fs_root(struct fs_struct *fs, const struct path *path) { struct path old_root; @@ -29,7 +29,7 @@ void set_fs_root(struct fs_struct *fs, struct path *path) * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values. * It can block. */ -void set_fs_pwd(struct fs_struct *fs, struct path *path) +void set_fs_pwd(struct fs_struct *fs, const struct path *path) { struct path old_pwd; @@ -53,7 +53,7 @@ static inline int replace_path(struct path *p, const struct path *old, const str return 1; } -void chroot_fs_refs(struct path *old_root, struct path *new_root) +void chroot_fs_refs(const struct path *old_root, const struct path *new_root) { struct task_struct *g, *p; struct fs_struct *fs; diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 8dcb114758e3..e2cba1f60c21 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -237,13 +237,12 @@ static int fscache_alloc_object(struct fscache_cache *cache, struct fscache_cookie *cookie) { struct fscache_object *object; - struct hlist_node *_n; int ret; _enter("%p,%p{%s}", cache, cookie, cookie->def->name); spin_lock(&cookie->lock); - hlist_for_each_entry(object, _n, &cookie->backing_objects, + hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) { if (object->cache == cache) goto object_already_extant; @@ -311,7 +310,6 @@ static int fscache_attach_object(struct fscache_cookie *cookie, { struct fscache_object *p; struct fscache_cache *cache = object->cache; - struct hlist_node *_n; int ret; _enter("{%s},{OBJ%x}", cookie->def->name, object->debug_id); @@ -321,7 +319,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie, /* there may be multiple initial creations of this object, but we only * want one */ ret = -EEXIST; - hlist_for_each_entry(p, _n, &cookie->backing_objects, cookie_link) { + hlist_for_each_entry(p, &cookie->backing_objects, cookie_link) { if (p->cache == object->cache) { if (p->state >= FSCACHE_OBJECT_DYING) ret = -ENOBUFS; @@ -331,7 +329,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie, /* pin the parent object */ spin_lock_nested(&cookie->parent->lock, 1); - hlist_for_each_entry(p, _n, &cookie->parent->backing_objects, + hlist_for_each_entry(p, &cookie->parent->backing_objects, cookie_link) { if (p->cache == object->cache) { if (p->state >= FSCACHE_OBJECT_DYING) { @@ -435,7 +433,6 @@ EXPORT_SYMBOL(__fscache_wait_on_invalidate); void __fscache_update_cookie(struct fscache_cookie *cookie) { struct fscache_object *object; - struct hlist_node *_p; fscache_stat(&fscache_n_updates); @@ -452,7 +449,7 @@ void __fscache_update_cookie(struct fscache_cookie *cookie) spin_lock(&cookie->lock); /* update the index entry on disk in each cache backing this cookie */ - hlist_for_each_entry(object, _p, + hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) { fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE); } diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 75a20c092dd4..a0b0855d00a9 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -23,7 +23,7 @@ static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file) { struct fuse_conn *fc; mutex_lock(&fuse_mutex); - fc = file->f_path.dentry->d_inode->i_private; + fc = file_inode(file)->i_private; if (fc) fc = fuse_conn_get(fc); mutex_unlock(&fuse_mutex); @@ -341,6 +341,7 @@ static struct file_system_type fuse_ctl_fs_type = { .mount = fuse_ctl_mount, .kill_sb = fuse_ctl_kill_sb, }; +MODULE_ALIAS_FS("fusectl"); int __init fuse_ctl_init(void) { diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index e397b675b029..6f96a8def147 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -91,19 +91,22 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { loff_t pos = 0; + struct iovec iov = { .iov_base = buf, .iov_len = count }; - return fuse_direct_io(file, buf, count, &pos, 0); + return fuse_direct_io(file, &iov, 1, count, &pos, 0); } static ssize_t cuse_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { loff_t pos = 0; + struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; + /* * No locking or generic_write_checks(), the server is * responsible for locking and sanity checks. */ - return fuse_direct_io(file, buf, count, &pos, 1); + return fuse_direct_io(file, &iov, 1, count, &pos, 1); } static int cuse_open(struct inode *inode, struct file *file) @@ -419,7 +422,7 @@ static int cuse_send_init(struct cuse_conn *cc) BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); - req = fuse_get_req(fc); + req = fuse_get_req(fc, 1); if (IS_ERR(req)) { rc = PTR_ERR(req); goto err; @@ -449,6 +452,7 @@ static int cuse_send_init(struct cuse_conn *cc) req->out.argvar = 1; req->out.argpages = 1; req->pages[0] = page; + req->page_descs[0].length = req->out.args[1].size; req->num_pages = 1; req->end = cuse_process_init_reply; fuse_request_send_background(fc, req); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index e83351aa5bad..11dfa0c3fb46 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -34,34 +34,67 @@ static struct fuse_conn *fuse_get_conn(struct file *file) return file->private_data; } -static void fuse_request_init(struct fuse_req *req) +static void fuse_request_init(struct fuse_req *req, struct page **pages, + struct fuse_page_desc *page_descs, + unsigned npages) { memset(req, 0, sizeof(*req)); + memset(pages, 0, sizeof(*pages) * npages); + memset(page_descs, 0, sizeof(*page_descs) * npages); INIT_LIST_HEAD(&req->list); INIT_LIST_HEAD(&req->intr_entry); init_waitqueue_head(&req->waitq); atomic_set(&req->count, 1); + req->pages = pages; + req->page_descs = page_descs; + req->max_pages = npages; } -struct fuse_req *fuse_request_alloc(void) +static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) { - struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_KERNEL); - if (req) - fuse_request_init(req); + struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, flags); + if (req) { + struct page **pages; + struct fuse_page_desc *page_descs; + + if (npages <= FUSE_REQ_INLINE_PAGES) { + pages = req->inline_pages; + page_descs = req->inline_page_descs; + } else { + pages = kmalloc(sizeof(struct page *) * npages, flags); + page_descs = kmalloc(sizeof(struct fuse_page_desc) * + npages, flags); + } + + if (!pages || !page_descs) { + kfree(pages); + kfree(page_descs); + kmem_cache_free(fuse_req_cachep, req); + return NULL; + } + + fuse_request_init(req, pages, page_descs, npages); + } return req; } + +struct fuse_req *fuse_request_alloc(unsigned npages) +{ + return __fuse_request_alloc(npages, GFP_KERNEL); +} EXPORT_SYMBOL_GPL(fuse_request_alloc); -struct fuse_req *fuse_request_alloc_nofs(void) +struct fuse_req *fuse_request_alloc_nofs(unsigned npages) { - struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_NOFS); - if (req) - fuse_request_init(req); - return req; + return __fuse_request_alloc(npages, GFP_NOFS); } void fuse_request_free(struct fuse_req *req) { + if (req->pages != req->inline_pages) { + kfree(req->pages); + kfree(req->page_descs); + } kmem_cache_free(fuse_req_cachep, req); } @@ -97,7 +130,7 @@ static void fuse_req_init_context(struct fuse_req *req) req->in.h.pid = current->pid; } -struct fuse_req *fuse_get_req(struct fuse_conn *fc) +struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages) { struct fuse_req *req; sigset_t oldset; @@ -116,7 +149,7 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc) if (!fc->connected) goto out; - req = fuse_request_alloc(); + req = fuse_request_alloc(npages); err = -ENOMEM; if (!req) goto out; @@ -165,7 +198,7 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req) struct fuse_file *ff = file->private_data; spin_lock(&fc->lock); - fuse_request_init(req); + fuse_request_init(req, req->pages, req->page_descs, req->max_pages); BUG_ON(ff->reserved_req); ff->reserved_req = req; wake_up_all(&fc->reserved_req_waitq); @@ -186,13 +219,14 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req) * filesystem should not have it's own file open. If deadlock is * intentional, it can still be broken by "aborting" the filesystem. */ -struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file) +struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, + struct file *file) { struct fuse_req *req; atomic_inc(&fc->num_waiting); wait_event(fc->blocked_waitq, !fc->blocked); - req = fuse_request_alloc(); + req = fuse_request_alloc(0); if (!req) req = get_reserved_req(fc, file); @@ -406,9 +440,8 @@ __acquires(fc->lock) } } -void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) +static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) { - req->isreply = 1; spin_lock(&fc->lock); if (!fc->connected) req->out.h.error = -ENOTCONN; @@ -425,6 +458,12 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) } spin_unlock(&fc->lock); } + +void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) +{ + req->isreply = 1; + __fuse_request_send(fc, req); +} EXPORT_SYMBOL_GPL(fuse_request_send); static void fuse_request_send_nowait_locked(struct fuse_conn *fc, @@ -491,6 +530,27 @@ void fuse_request_send_background_locked(struct fuse_conn *fc, fuse_request_send_nowait_locked(fc, req); } +void fuse_force_forget(struct file *file, u64 nodeid) +{ + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_forget_in inarg; + + memset(&inarg, 0, sizeof(inarg)); + inarg.nlookup = 1; + req = fuse_get_req_nofail_nopages(fc, file); + req->in.h.opcode = FUSE_FORGET; + req->in.h.nodeid = nodeid; + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->isreply = 0; + __fuse_request_send(fc, req); + /* ignore errors */ + fuse_put_request(fc, req); +} + /* * Lock the request. Up to the next unlock_request() there mustn't be * anything that could cause a page-fault. If the request was already @@ -850,11 +910,11 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, { unsigned i; struct fuse_req *req = cs->req; - unsigned offset = req->page_offset; - unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset); for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) { int err; + unsigned offset = req->page_descs[i].offset; + unsigned count = min(nbytes, req->page_descs[i].length); err = fuse_copy_page(cs, &req->pages[i], offset, count, zeroing); @@ -862,8 +922,6 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, return err; nbytes -= count; - count = min(nbytes, (unsigned) PAGE_SIZE); - offset = 0; } return 0; } @@ -1536,29 +1594,34 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, unsigned int num; unsigned int offset; size_t total_len = 0; + int num_pages; + + offset = outarg->offset & ~PAGE_CACHE_MASK; + file_size = i_size_read(inode); + + num = outarg->size; + if (outarg->offset > file_size) + num = 0; + else if (outarg->offset + num > file_size) + num = file_size - outarg->offset; + + num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; + num_pages = min(num_pages, FUSE_MAX_PAGES_PER_REQ); - req = fuse_get_req(fc); + req = fuse_get_req(fc, num_pages); if (IS_ERR(req)) return PTR_ERR(req); - offset = outarg->offset & ~PAGE_CACHE_MASK; - req->in.h.opcode = FUSE_NOTIFY_REPLY; req->in.h.nodeid = outarg->nodeid; req->in.numargs = 2; req->in.argpages = 1; - req->page_offset = offset; + req->page_descs[0].offset = offset; req->end = fuse_retrieve_end; index = outarg->offset >> PAGE_CACHE_SHIFT; - file_size = i_size_read(inode); - num = outarg->size; - if (outarg->offset > file_size) - num = 0; - else if (outarg->offset + num > file_size) - num = file_size - outarg->offset; - while (num && req->num_pages < FUSE_MAX_PAGES_PER_REQ) { + while (num && req->num_pages < num_pages) { struct page *page; unsigned int this_num; @@ -1568,6 +1631,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); req->pages[req->num_pages] = page; + req->page_descs[req->num_pages].length = this_num; req->num_pages++; offset = 0; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index b7c09f9eb40c..ff15522481d4 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -14,6 +14,29 @@ #include <linux/namei.h> #include <linux/slab.h> +static bool fuse_use_readdirplus(struct inode *dir, struct file *filp) +{ + struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_inode *fi = get_fuse_inode(dir); + + if (!fc->do_readdirplus) + return false; + if (!fc->readdirplus_auto) + return true; + if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state)) + return true; + if (filp->f_pos == 0) + return true; + return false; +} + +static void fuse_advise_use_readdirplus(struct inode *dir) +{ + struct fuse_inode *fi = get_fuse_inode(dir); + + set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state); +} + #if BITS_PER_LONG >= 64 static inline void fuse_dentry_settime(struct dentry *entry, u64 time) { @@ -178,7 +201,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) return -ECHILD; fc = get_fuse_conn(inode); - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return 0; @@ -219,6 +242,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) attr_version); fuse_change_entry_timeout(entry, &outarg); } + fuse_advise_use_readdirplus(inode); return 1; } @@ -271,7 +295,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, if (name->len > FUSE_NAME_MAX) goto out; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); err = PTR_ERR(req); if (IS_ERR(req)) goto out; @@ -355,6 +379,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, else fuse_invalidate_entry_cache(entry); + fuse_advise_use_readdirplus(dir); return newent; out_iput: @@ -391,7 +416,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, if (!forget) goto out_err; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); err = PTR_ERR(req); if (IS_ERR(req)) goto out_put_forget_req; @@ -592,7 +617,7 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, { struct fuse_mknod_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -623,7 +648,7 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) { struct fuse_mkdir_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -647,7 +672,7 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, { struct fuse_conn *fc = get_fuse_conn(dir); unsigned len = strlen(link) + 1; - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -664,7 +689,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -682,7 +707,14 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) spin_lock(&fc->lock); fi->attr_version = ++fc->attr_version; - drop_nlink(inode); + /* + * If i_nlink == 0 then unlink doesn't make sense, yet this can + * happen if userspace filesystem is careless. It would be + * difficult to enforce correct nlink usage so just ignore this + * condition here + */ + if (inode->i_nlink > 0) + drop_nlink(inode); spin_unlock(&fc->lock); fuse_invalidate_attr(inode); fuse_invalidate_attr(dir); @@ -696,7 +728,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) { int err; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -723,7 +755,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, int err; struct fuse_rename_in inarg; struct fuse_conn *fc = get_fuse_conn(olddir); - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -776,7 +808,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, struct fuse_link_in inarg; struct inode *inode = entry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -848,7 +880,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, struct fuse_req *req; u64 attr_version; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -985,7 +1017,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, /* * Calling into a user-controlled filesystem gives the filesystem - * daemon ptrace-like capabilities over the requester process. This + * daemon ptrace-like capabilities over the current process. This * means, that the filesystem daemon is able to record the exact * filesystem operations performed, and can also control the behavior * of the requester process in otherwise impossible ways. For example @@ -996,27 +1028,23 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, * for which the owner of the mount has ptrace privilege. This * excludes processes started by other users, suid or sgid processes. */ -int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task) +int fuse_allow_current_process(struct fuse_conn *fc) { const struct cred *cred; - int ret; if (fc->flags & FUSE_ALLOW_OTHER) return 1; - rcu_read_lock(); - ret = 0; - cred = __task_cred(task); + cred = current_cred(); if (uid_eq(cred->euid, fc->user_id) && uid_eq(cred->suid, fc->user_id) && uid_eq(cred->uid, fc->user_id) && gid_eq(cred->egid, fc->group_id) && gid_eq(cred->sgid, fc->group_id) && gid_eq(cred->gid, fc->group_id)) - ret = 1; - rcu_read_unlock(); + return 1; - return ret; + return 0; } static int fuse_access(struct inode *inode, int mask) @@ -1029,7 +1057,7 @@ static int fuse_access(struct inode *inode, int mask) if (fc->no_access) return 0; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -1077,7 +1105,7 @@ static int fuse_permission(struct inode *inode, int mask) bool refreshed = false; int err = 0; - if (!fuse_allow_task(fc, current)) + if (!fuse_allow_current_process(fc)) return -EACCES; /* @@ -1155,19 +1183,157 @@ static int parse_dirfile(char *buf, size_t nbytes, struct file *file, return 0; } -static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) +static int fuse_direntplus_link(struct file *file, + struct fuse_direntplus *direntplus, + u64 attr_version) { int err; + struct fuse_entry_out *o = &direntplus->entry_out; + struct fuse_dirent *dirent = &direntplus->dirent; + struct dentry *parent = file->f_path.dentry; + struct qstr name = QSTR_INIT(dirent->name, dirent->namelen); + struct dentry *dentry; + struct dentry *alias; + struct inode *dir = parent->d_inode; + struct fuse_conn *fc; + struct inode *inode; + + if (!o->nodeid) { + /* + * Unlike in the case of fuse_lookup, zero nodeid does not mean + * ENOENT. Instead, it only means the userspace filesystem did + * not want to return attributes/handle for this entry. + * + * So do nothing. + */ + return 0; + } + + if (name.name[0] == '.') { + /* + * We could potentially refresh the attributes of the directory + * and its parent? + */ + if (name.len == 1) + return 0; + if (name.name[1] == '.' && name.len == 2) + return 0; + } + fc = get_fuse_conn(dir); + + name.hash = full_name_hash(name.name, name.len); + dentry = d_lookup(parent, &name); + if (dentry && dentry->d_inode) { + inode = dentry->d_inode; + if (get_node_id(inode) == o->nodeid) { + struct fuse_inode *fi; + fi = get_fuse_inode(inode); + spin_lock(&fc->lock); + fi->nlookup++; + spin_unlock(&fc->lock); + + /* + * The other branch to 'found' comes via fuse_iget() + * which bumps nlookup inside + */ + goto found; + } + err = d_invalidate(dentry); + if (err) + goto out; + dput(dentry); + dentry = NULL; + } + + dentry = d_alloc(parent, &name); + err = -ENOMEM; + if (!dentry) + goto out; + + inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, + &o->attr, entry_attr_timeout(o), attr_version); + if (!inode) + goto out; + + alias = d_materialise_unique(dentry, inode); + err = PTR_ERR(alias); + if (IS_ERR(alias)) + goto out; + if (alias) { + dput(dentry); + dentry = alias; + } + +found: + fuse_change_attributes(inode, &o->attr, entry_attr_timeout(o), + attr_version); + + fuse_change_entry_timeout(dentry, o); + + err = 0; +out: + if (dentry) + dput(dentry); + return err; +} + +static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, + void *dstbuf, filldir_t filldir, u64 attr_version) +{ + struct fuse_direntplus *direntplus; + struct fuse_dirent *dirent; + size_t reclen; + int over = 0; + int ret; + + while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) { + direntplus = (struct fuse_direntplus *) buf; + dirent = &direntplus->dirent; + reclen = FUSE_DIRENTPLUS_SIZE(direntplus); + + if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) + return -EIO; + if (reclen > nbytes) + break; + + if (!over) { + /* We fill entries into dstbuf only as much as + it can hold. But we still continue iterating + over remaining entries to link them. If not, + we need to send a FORGET for each of those + which we did not link. + */ + over = filldir(dstbuf, dirent->name, dirent->namelen, + file->f_pos, dirent->ino, + dirent->type); + file->f_pos = dirent->off; + } + + buf += reclen; + nbytes -= reclen; + + ret = fuse_direntplus_link(file, direntplus, attr_version); + if (ret) + fuse_force_forget(file, direntplus->entry_out.nodeid); + } + + return 0; +} + +static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) +{ + int plus, err; size_t nbytes; struct page *page; - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_req *req; + u64 attr_version = 0; if (is_bad_inode(inode)) return -EIO; - req = fuse_get_req(fc); + req = fuse_get_req(fc, 1); if (IS_ERR(req)) return PTR_ERR(req); @@ -1176,17 +1342,34 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) fuse_put_request(fc, req); return -ENOMEM; } + + plus = fuse_use_readdirplus(inode, file); req->out.argpages = 1; req->num_pages = 1; req->pages[0] = page; - fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, FUSE_READDIR); + req->page_descs[0].length = PAGE_SIZE; + if (plus) { + attr_version = fuse_get_attr_version(fc); + fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, + FUSE_READDIRPLUS); + } else { + fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, + FUSE_READDIR); + } fuse_request_send(fc, req); nbytes = req->out.args[0].size; err = req->out.h.error; fuse_put_request(fc, req); - if (!err) - err = parse_dirfile(page_address(page), nbytes, file, dstbuf, - filldir); + if (!err) { + if (plus) { + err = parse_dirplusfile(page_address(page), nbytes, + file, dstbuf, filldir, + attr_version); + } else { + err = parse_dirfile(page_address(page), nbytes, file, + dstbuf, filldir); + } + } __free_page(page); fuse_invalidate_attr(inode); /* atime changed */ @@ -1197,7 +1380,7 @@ static char *read_link(struct dentry *dentry) { struct inode *inode = dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); char *link; if (IS_ERR(req)) @@ -1391,7 +1574,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, loff_t oldsize; int err; - if (!fuse_allow_task(fc, current)) + if (!fuse_allow_current_process(fc)) return -EACCES; if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) @@ -1410,7 +1593,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, if (attr->ia_valid & ATTR_SIZE) is_truncate = true; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -1500,7 +1683,7 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, struct inode *inode = entry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - if (!fuse_allow_task(fc, current)) + if (!fuse_allow_current_process(fc)) return -EACCES; return fuse_update_attributes(inode, stat, NULL, NULL); @@ -1518,7 +1701,7 @@ static int fuse_setxattr(struct dentry *entry, const char *name, if (fc->no_setxattr) return -EOPNOTSUPP; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -1557,7 +1740,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name, if (fc->no_getxattr) return -EOPNOTSUPP; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -1603,13 +1786,13 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) struct fuse_getxattr_out outarg; ssize_t ret; - if (!fuse_allow_task(fc, current)) + if (!fuse_allow_current_process(fc)) return -EACCES; if (fc->no_listxattr) return -EOPNOTSUPP; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -1654,7 +1837,7 @@ static int fuse_removexattr(struct dentry *entry, const char *name) if (fc->no_removexattr) return -EOPNOTSUPP; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f3ab824fa302..34b80ba95bad 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -25,7 +25,7 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, struct fuse_req *req; int err; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -57,7 +57,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) return NULL; ff->fc = fc; - ff->reserved_req = fuse_request_alloc(); + ff->reserved_req = fuse_request_alloc(0); if (unlikely(!ff->reserved_req)) { kfree(ff); return NULL; @@ -355,7 +355,7 @@ static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) static int fuse_flush(struct file *file, fl_owner_t id) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_file *ff = file->private_data; struct fuse_req *req; @@ -368,7 +368,7 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (fc->no_flush) return 0; - req = fuse_get_req_nofail(fc, file); + req = fuse_get_req_nofail_nopages(fc, file); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; inarg.lock_owner = fuse_lock_owner_id(fc, id); @@ -436,7 +436,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, fuse_sync_writes(inode); - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; @@ -544,7 +544,7 @@ static int fuse_readpage(struct file *file, struct page *page) */ fuse_wait_on_page_writeback(inode, page->index); - req = fuse_get_req(fc); + req = fuse_get_req(fc, 1); err = PTR_ERR(req); if (IS_ERR(req)) goto out; @@ -555,6 +555,7 @@ static int fuse_readpage(struct file *file, struct page *page) req->out.argpages = 1; req->num_pages = 1; req->pages[0] = page; + req->page_descs[0].length = count; num_read = fuse_send_read(req, file, pos, count, NULL); err = req->out.h.error; fuse_put_request(fc, req); @@ -641,6 +642,7 @@ struct fuse_fill_data { struct fuse_req *req; struct file *file; struct inode *inode; + unsigned nr_pages; }; static int fuse_readpages_fill(void *_data, struct page *page) @@ -656,16 +658,26 @@ static int fuse_readpages_fill(void *_data, struct page *page) (req->num_pages == FUSE_MAX_PAGES_PER_REQ || (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read || req->pages[req->num_pages - 1]->index + 1 != page->index)) { + int nr_alloc = min_t(unsigned, data->nr_pages, + FUSE_MAX_PAGES_PER_REQ); fuse_send_readpages(req, data->file); - data->req = req = fuse_get_req(fc); + data->req = req = fuse_get_req(fc, nr_alloc); if (IS_ERR(req)) { unlock_page(page); return PTR_ERR(req); } } + + if (WARN_ON(req->num_pages >= req->max_pages)) { + fuse_put_request(fc, req); + return -EIO; + } + page_cache_get(page); req->pages[req->num_pages] = page; + req->page_descs[req->num_pages].length = PAGE_SIZE; req->num_pages++; + data->nr_pages--; return 0; } @@ -676,6 +688,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_fill_data data; int err; + int nr_alloc = min_t(unsigned, nr_pages, FUSE_MAX_PAGES_PER_REQ); err = -EIO; if (is_bad_inode(inode)) @@ -683,7 +696,8 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, data.file = file; data.inode = inode; - data.req = fuse_get_req(fc); + data.req = fuse_get_req(fc, nr_alloc); + data.nr_pages = nr_pages; err = PTR_ERR(data.req); if (IS_ERR(data.req)) goto out; @@ -786,7 +800,7 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, res = fuse_send_write(req, file, pos, count, NULL); - offset = req->page_offset; + offset = req->page_descs[0].offset; count = res; for (i = 0; i < req->num_pages; i++) { struct page *page = req->pages[i]; @@ -817,7 +831,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, int err; req->in.argpages = 1; - req->page_offset = offset; + req->page_descs[0].offset = offset; do { size_t tmp; @@ -857,6 +871,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, err = 0; req->pages[req->num_pages] = page; + req->page_descs[req->num_pages].length = tmp; req->num_pages++; iov_iter_advance(ii, tmp); @@ -869,11 +884,19 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, if (!fc->big_writes) break; } while (iov_iter_count(ii) && count < fc->max_write && - req->num_pages < FUSE_MAX_PAGES_PER_REQ && offset == 0); + req->num_pages < req->max_pages && offset == 0); return count > 0 ? count : err; } +static inline unsigned fuse_wr_pages(loff_t pos, size_t len) +{ + return min_t(unsigned, + ((pos + len - 1) >> PAGE_CACHE_SHIFT) - + (pos >> PAGE_CACHE_SHIFT) + 1, + FUSE_MAX_PAGES_PER_REQ); +} + static ssize_t fuse_perform_write(struct file *file, struct address_space *mapping, struct iov_iter *ii, loff_t pos) @@ -889,8 +912,9 @@ static ssize_t fuse_perform_write(struct file *file, do { struct fuse_req *req; ssize_t count; + unsigned nr_pages = fuse_wr_pages(pos, iov_iter_count(ii)); - req = fuse_get_req(fc); + req = fuse_get_req(fc, nr_pages); if (IS_ERR(req)) { err = PTR_ERR(req); break; @@ -1023,47 +1047,110 @@ static void fuse_release_user_pages(struct fuse_req *req, int write) } } -static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, +static inline void fuse_page_descs_length_init(struct fuse_req *req, + unsigned index, unsigned nr_pages) +{ + int i; + + for (i = index; i < index + nr_pages; i++) + req->page_descs[i].length = PAGE_SIZE - + req->page_descs[i].offset; +} + +static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii) +{ + return (unsigned long)ii->iov->iov_base + ii->iov_offset; +} + +static inline size_t fuse_get_frag_size(const struct iov_iter *ii, + size_t max_size) +{ + return min(iov_iter_single_seg_count(ii), max_size); +} + +static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, size_t *nbytesp, int write) { - size_t nbytes = *nbytesp; - unsigned long user_addr = (unsigned long) buf; - unsigned offset = user_addr & ~PAGE_MASK; - int npages; + size_t nbytes = 0; /* # bytes already packed in req */ /* Special case for kernel I/O: can copy directly into the buffer */ if (segment_eq(get_fs(), KERNEL_DS)) { + unsigned long user_addr = fuse_get_user_addr(ii); + size_t frag_size = fuse_get_frag_size(ii, *nbytesp); + if (write) req->in.args[1].value = (void *) user_addr; else req->out.args[0].value = (void *) user_addr; + iov_iter_advance(ii, frag_size); + *nbytesp = frag_size; return 0; } - nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); - npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; - npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ); - npages = get_user_pages_fast(user_addr, npages, !write, req->pages); - if (npages < 0) - return npages; + while (nbytes < *nbytesp && req->num_pages < req->max_pages) { + unsigned npages; + unsigned long user_addr = fuse_get_user_addr(ii); + unsigned offset = user_addr & ~PAGE_MASK; + size_t frag_size = fuse_get_frag_size(ii, *nbytesp - nbytes); + int ret; + + unsigned n = req->max_pages - req->num_pages; + frag_size = min_t(size_t, frag_size, n << PAGE_SHIFT); + + npages = (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; + npages = clamp(npages, 1U, n); + + ret = get_user_pages_fast(user_addr, npages, !write, + &req->pages[req->num_pages]); + if (ret < 0) + return ret; - req->num_pages = npages; - req->page_offset = offset; + npages = ret; + frag_size = min_t(size_t, frag_size, + (npages << PAGE_SHIFT) - offset); + iov_iter_advance(ii, frag_size); + + req->page_descs[req->num_pages].offset = offset; + fuse_page_descs_length_init(req, req->num_pages, npages); + + req->num_pages += npages; + req->page_descs[req->num_pages - 1].length -= + (npages << PAGE_SHIFT) - offset - frag_size; + + nbytes += frag_size; + } if (write) req->in.argpages = 1; else req->out.argpages = 1; - nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset; - *nbytesp = min(*nbytesp, nbytes); + *nbytesp = nbytes; return 0; } -ssize_t fuse_direct_io(struct file *file, const char __user *buf, - size_t count, loff_t *ppos, int write) +static inline int fuse_iter_npages(const struct iov_iter *ii_p) +{ + struct iov_iter ii = *ii_p; + int npages = 0; + + while (iov_iter_count(&ii) && npages < FUSE_MAX_PAGES_PER_REQ) { + unsigned long user_addr = fuse_get_user_addr(&ii); + unsigned offset = user_addr & ~PAGE_MASK; + size_t frag_size = iov_iter_single_seg_count(&ii); + + npages += (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; + iov_iter_advance(&ii, frag_size); + } + + return min(npages, FUSE_MAX_PAGES_PER_REQ); +} + +ssize_t fuse_direct_io(struct file *file, const struct iovec *iov, + unsigned long nr_segs, size_t count, loff_t *ppos, + int write) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; @@ -1071,8 +1158,11 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf, loff_t pos = *ppos; ssize_t res = 0; struct fuse_req *req; + struct iov_iter ii; + + iov_iter_init(&ii, iov, nr_segs, count, 0); - req = fuse_get_req(fc); + req = fuse_get_req(fc, fuse_iter_npages(&ii)); if (IS_ERR(req)) return PTR_ERR(req); @@ -1080,7 +1170,7 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf, size_t nres; fl_owner_t owner = current->files; size_t nbytes = min(count, nmax); - int err = fuse_get_user_pages(req, buf, &nbytes, write); + int err = fuse_get_user_pages(req, &ii, &nbytes, write); if (err) { res = err; break; @@ -1103,12 +1193,11 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf, count -= nres; res += nres; pos += nres; - buf += nres; if (nres != nbytes) break; if (count) { fuse_put_request(fc, req); - req = fuse_get_req(fc); + req = fuse_get_req(fc, fuse_iter_npages(&ii)); if (IS_ERR(req)) break; } @@ -1122,31 +1211,40 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf, } EXPORT_SYMBOL_GPL(fuse_direct_io); -static ssize_t fuse_direct_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) +static ssize_t __fuse_direct_read(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) { ssize_t res; - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); if (is_bad_inode(inode)) return -EIO; - res = fuse_direct_io(file, buf, count, ppos, 0); + res = fuse_direct_io(file, iov, nr_segs, iov_length(iov, nr_segs), + ppos, 0); fuse_invalidate_attr(inode); return res; } -static ssize_t __fuse_direct_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t fuse_direct_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) { - struct inode *inode = file->f_path.dentry->d_inode; + struct iovec iov = { .iov_base = buf, .iov_len = count }; + return __fuse_direct_read(file, &iov, 1, ppos); +} + +static ssize_t __fuse_direct_write(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) +{ + struct inode *inode = file_inode(file); + size_t count = iov_length(iov, nr_segs); ssize_t res; res = generic_write_checks(file, ppos, &count, 0); if (!res) { - res = fuse_direct_io(file, buf, count, ppos, 1); + res = fuse_direct_io(file, iov, nr_segs, count, ppos, 1); if (res > 0) fuse_write_update_size(inode, *ppos); } @@ -1159,7 +1257,8 @@ static ssize_t __fuse_direct_write(struct file *file, const char __user *buf, static ssize_t fuse_direct_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct inode *inode = file->f_path.dentry->d_inode; + struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; + struct inode *inode = file_inode(file); ssize_t res; if (is_bad_inode(inode)) @@ -1167,7 +1266,7 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf, /* Don't allow parallel writes to the same file */ mutex_lock(&inode->i_mutex); - res = __fuse_direct_write(file, buf, count, ppos); + res = __fuse_direct_write(file, &iov, 1, ppos); mutex_unlock(&inode->i_mutex); return res; @@ -1272,7 +1371,7 @@ static int fuse_writepage_locked(struct page *page) set_page_writeback(page); - req = fuse_request_alloc_nofs(); + req = fuse_request_alloc_nofs(1); if (!req) goto err; @@ -1293,7 +1392,8 @@ static int fuse_writepage_locked(struct page *page) req->in.argpages = 1; req->num_pages = 1; req->pages[0] = tmp_page; - req->page_offset = 0; + req->page_descs[0].offset = 0; + req->page_descs[0].length = PAGE_SIZE; req->end = fuse_writepage_end; req->inode = inode; @@ -1385,7 +1485,7 @@ static const struct vm_operations_struct fuse_file_vm_ops = { static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) { if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_file *ff = file->private_data; @@ -1443,7 +1543,7 @@ static void fuse_lk_fill(struct fuse_req *req, struct file *file, const struct file_lock *fl, int opcode, pid_t pid, int flock) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_file *ff = file->private_data; struct fuse_lk_in *arg = &req->misc.lk_in; @@ -1465,13 +1565,13 @@ static void fuse_lk_fill(struct fuse_req *req, struct file *file, static int fuse_getlk(struct file *file, struct file_lock *fl) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_req *req; struct fuse_lk_out outarg; int err; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -1490,7 +1590,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl) static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_req *req; int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; @@ -1506,7 +1606,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) if (fl->fl_flags & FL_CLOSE) return 0; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -1522,7 +1622,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); int err; @@ -1545,7 +1645,7 @@ static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl) static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); int err; @@ -1575,7 +1675,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) if (!inode->i_sb->s_bdev || fc->no_bmap) return 0; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return 0; @@ -1602,7 +1702,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) { loff_t retval; - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */ if (whence == SEEK_CUR || whence == SEEK_SET) @@ -1873,7 +1973,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, num_pages++; } - req = fuse_get_req(fc); + req = fuse_get_req(fc, num_pages); if (IS_ERR(req)) { err = PTR_ERR(req); req = NULL; @@ -1881,6 +1981,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, } memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages); req->num_pages = num_pages; + fuse_page_descs_length_init(req, 0, req->num_pages); /* okay, let's send it to the client */ req->in.h.opcode = FUSE_IOCTL; @@ -1978,10 +2079,10 @@ EXPORT_SYMBOL_GPL(fuse_do_ioctl); long fuse_ioctl_common(struct file *file, unsigned int cmd, unsigned long arg, unsigned int flags) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); - if (!fuse_allow_task(fc, current)) + if (!fuse_allow_current_process(fc)) return -EACCES; if (is_bad_inode(inode)) @@ -2066,6 +2167,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait) return DEFAULT_POLLMASK; poll_wait(file, &ff->poll_wait, wait); + inarg.events = (__u32)poll_requested_events(wait); /* * Ask for notification iff there's someone waiting for it. @@ -2076,7 +2178,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait) fuse_register_polled_file(fc, ff); } - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return POLLERR; @@ -2126,41 +2228,6 @@ int fuse_notify_poll_wakeup(struct fuse_conn *fc, return 0; } -static ssize_t fuse_loop_dio(struct file *filp, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos, int rw) -{ - const struct iovec *vector = iov; - ssize_t ret = 0; - - while (nr_segs > 0) { - void __user *base; - size_t len; - ssize_t nr; - - base = vector->iov_base; - len = vector->iov_len; - vector++; - nr_segs--; - - if (rw == WRITE) - nr = __fuse_direct_write(filp, base, len, ppos); - else - nr = fuse_direct_read(filp, base, len, ppos); - - if (nr < 0) { - if (!ret) - ret = nr; - break; - } - ret += nr; - if (nr != len) - break; - } - - return ret; -} - - static ssize_t fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) @@ -2172,7 +2239,10 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, file = iocb->ki_filp; pos = offset; - ret = fuse_loop_dio(file, iov, nr_segs, &pos, rw); + if (rw == WRITE) + ret = __fuse_direct_write(file, iov, nr_segs, &pos); + else + ret = __fuse_direct_read(file, iov, nr_segs, &pos); return ret; } @@ -2194,7 +2264,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (fc->no_fallocate) return -EOPNOTSUPP; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index e105a53fc72d..6aeba864f070 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -44,6 +44,9 @@ doing the mount will be allowed to access the filesystem */ #define FUSE_ALLOW_OTHER (1 << 1) +/** Number of page pointers embedded in fuse_req */ +#define FUSE_REQ_INLINE_PAGES 1 + /** List of active connections */ extern struct list_head fuse_conn_list; @@ -103,6 +106,15 @@ struct fuse_inode { /** List of writepage requestst (pending or sent) */ struct list_head writepages; + + /** Miscellaneous bits describing inode state */ + unsigned long state; +}; + +/** FUSE inode state bits */ +enum { + /** Advise readdirplus */ + FUSE_I_ADVISE_RDPLUS, }; struct fuse_conn; @@ -200,6 +212,12 @@ struct fuse_out { struct fuse_arg args[3]; }; +/** FUSE page descriptor */ +struct fuse_page_desc { + unsigned int length; + unsigned int offset; +}; + /** The request state */ enum fuse_req_state { FUSE_REQ_INIT = 0, @@ -291,14 +309,23 @@ struct fuse_req { } misc; /** page vector */ - struct page *pages[FUSE_MAX_PAGES_PER_REQ]; + struct page **pages; + + /** page-descriptor vector */ + struct fuse_page_desc *page_descs; + + /** size of the 'pages' array */ + unsigned max_pages; + + /** inline page vector */ + struct page *inline_pages[FUSE_REQ_INLINE_PAGES]; + + /** inline page-descriptor vector */ + struct fuse_page_desc inline_page_descs[FUSE_REQ_INLINE_PAGES]; /** number of pages in vector */ unsigned num_pages; - /** offset of data on first page */ - unsigned page_offset; - /** File used in the request (or NULL) */ struct fuse_file *ff; @@ -487,6 +514,12 @@ struct fuse_conn { /** Use enhanced/automatic page cache invalidation. */ unsigned auto_inval_data:1; + /** Does the filesystem support readdirplus? */ + unsigned do_readdirplus:1; + + /** Does the filesystem want adaptive readdirplus? */ + unsigned readdirplus_auto:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -578,6 +611,9 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, struct fuse_forget_link *fuse_alloc_forget(void); +/* Used by READDIRPLUS */ +void fuse_force_forget(struct file *file, u64 nodeid); + /** * Initialize READ or READDIR request */ @@ -658,9 +694,9 @@ void fuse_ctl_cleanup(void); /** * Allocate a request */ -struct fuse_req *fuse_request_alloc(void); +struct fuse_req *fuse_request_alloc(unsigned npages); -struct fuse_req *fuse_request_alloc_nofs(void); +struct fuse_req *fuse_request_alloc_nofs(unsigned npages); /** * Free a request @@ -668,14 +704,25 @@ struct fuse_req *fuse_request_alloc_nofs(void); void fuse_request_free(struct fuse_req *req); /** - * Get a request, may fail with -ENOMEM + * Get a request, may fail with -ENOMEM, + * caller should specify # elements in req->pages[] explicitly */ -struct fuse_req *fuse_get_req(struct fuse_conn *fc); +struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages); + +/** + * Get a request, may fail with -ENOMEM, + * useful for callers who doesn't use req->pages[] + */ +static inline struct fuse_req *fuse_get_req_nopages(struct fuse_conn *fc) +{ + return fuse_get_req(fc, 0); +} /** * Gets a requests for a file operation, always succeeds */ -struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file); +struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, + struct file *file); /** * Decrement reference count of a request. If count goes to zero free @@ -739,9 +786,9 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc); int fuse_valid_type(int m); /** - * Is task allowed to perform filesystem operation? + * Is current process allowed to perform filesystem operation? */ -int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task); +int fuse_allow_current_process(struct fuse_conn *fc); u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); @@ -776,8 +823,9 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, bool isdir); -ssize_t fuse_direct_io(struct file *file, const char __user *buf, - size_t count, loff_t *ppos, int write); +ssize_t fuse_direct_io(struct file *file, const struct iovec *iov, + unsigned long nr_segs, size_t count, loff_t *ppos, + int write); long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, unsigned int flags); long fuse_ioctl_common(struct file *file, unsigned int cmd, diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 73ca6b72beaf..137185c3884f 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -92,6 +92,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) fi->attr_version = 0; fi->writectr = 0; fi->orig_ino = 0; + fi->state = 0; INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); INIT_LIST_HEAD(&fi->writepages); @@ -408,12 +409,12 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) struct fuse_statfs_out outarg; int err; - if (!fuse_allow_task(fc, current)) { + if (!fuse_allow_current_process(fc)) { buf->f_type = FUSE_SUPER_MAGIC; return 0; } - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -678,7 +679,7 @@ static int fuse_encode_fh(struct inode *inode, u32 *fh, int *max_len, if (*max_len < len) { *max_len = len; - return 255; + return FILEID_INVALID; } nodeid = get_fuse_inode(inode)->nodeid; @@ -863,6 +864,10 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->dont_mask = 1; if (arg->flags & FUSE_AUTO_INVAL_DATA) fc->auto_inval_data = 1; + if (arg->flags & FUSE_DO_READDIRPLUS) + fc->do_readdirplus = 1; + if (arg->flags & FUSE_READDIRPLUS_AUTO) + fc->readdirplus_auto = 1; } else { ra_pages = fc->max_read / PAGE_CACHE_SIZE; fc->no_lock = 1; @@ -889,7 +894,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | - FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA; + FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | + FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -1034,12 +1040,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) /* only now - we want root dentry with NULL ->d_op */ sb->s_d_op = &fuse_dentry_operations; - init_req = fuse_request_alloc(); + init_req = fuse_request_alloc(0); if (!init_req) goto err_put_root; if (is_bdev) { - fc->destroy_req = fuse_request_alloc(); + fc->destroy_req = fuse_request_alloc(0); if (!fc->destroy_req) goto err_free_init_req; } @@ -1111,6 +1117,7 @@ static struct file_system_type fuse_fs_type = { .mount = fuse_mount, .kill_sb = fuse_kill_sb_anon, }; +MODULE_ALIAS_FS("fuse"); #ifdef CONFIG_BLOCK static struct dentry *fuse_mount_blk(struct file_system_type *fs_type, @@ -1140,6 +1147,7 @@ static struct file_system_type fuseblk_fs_type = { .kill_sb = fuse_kill_sb_blk, .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE, }; +MODULE_ALIAS_FS("fuseblk"); static inline int register_fuseblk(void) { diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index f850020ad906..f69ac0af5496 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -237,7 +237,7 @@ static int gfs2_xattr_system_set(struct dentry *dentry, const char *name, return -EINVAL; if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) return value ? -EACCES : 0; - if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER)) + if (!uid_eq(current_fsuid(), inode->i_uid) && !capable(CAP_FOWNER)) return -EPERM; if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 30de4f2a2ea9..24f414f0ce61 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -51,7 +51,7 @@ static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, continue; if (gfs2_is_jdata(ip)) set_buffer_uptodate(bh); - gfs2_trans_add_bh(ip->i_gl, bh, 0); + gfs2_trans_add_data(ip->i_gl, bh); } } @@ -230,16 +230,14 @@ out_ignore: } /** - * gfs2_writeback_writepages - Write a bunch of dirty pages back to disk + * gfs2_writepages - Write a bunch of dirty pages back to disk * @mapping: The mapping to write * @wbc: Write-back control * - * For the data=writeback case we can already ignore buffer heads - * and write whole extents at once. This is a big reduction in the - * number of I/O requests we send and the bmap calls we make in this case. + * Used for both ordered and writeback modes. */ -static int gfs2_writeback_writepages(struct address_space *mapping, - struct writeback_control *wbc) +static int gfs2_writepages(struct address_space *mapping, + struct writeback_control *wbc) { return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); } @@ -852,7 +850,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, goto failed; } - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); if (gfs2_is_stuffed(ip)) return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page); @@ -1102,7 +1100,7 @@ cannot_release: static const struct address_space_operations gfs2_writeback_aops = { .writepage = gfs2_writeback_writepage, - .writepages = gfs2_writeback_writepages, + .writepages = gfs2_writepages, .readpage = gfs2_readpage, .readpages = gfs2_readpages, .write_begin = gfs2_write_begin, @@ -1118,6 +1116,7 @@ static const struct address_space_operations gfs2_writeback_aops = { static const struct address_space_operations gfs2_ordered_aops = { .writepage = gfs2_ordered_writepage, + .writepages = gfs2_writepages, .readpage = gfs2_readpage, .readpages = gfs2_readpages, .write_begin = gfs2_write_begin, diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index a68e91bcef3d..5e83657f046e 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -22,6 +22,7 @@ #include "meta_io.h" #include "quota.h" #include "rgrp.h" +#include "log.h" #include "super.h" #include "trans.h" #include "dir.h" @@ -93,7 +94,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, if (!gfs2_is_jdata(ip)) mark_buffer_dirty(bh); if (!gfs2_is_writeback(ip)) - gfs2_trans_add_bh(ip->i_gl, bh, 0); + gfs2_trans_add_data(ip->i_gl, bh); if (release) { unlock_page(page); @@ -153,7 +154,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) /* Set up the pointer to the new block */ - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); di = (struct gfs2_dinode *)dibh->b_data; gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); @@ -405,7 +406,7 @@ static inline __be64 *gfs2_indirect_init(struct metapath *mp, BUG_ON(i < 1); BUG_ON(mp->mp_bh[i] != NULL); mp->mp_bh[i] = gfs2_meta_new(gl, bn); - gfs2_trans_add_bh(gl, mp->mp_bh[i], 1); + gfs2_trans_add_meta(gl, mp->mp_bh[i]); gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN); gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header)); ptr += offset; @@ -468,7 +469,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, BUG_ON(sheight < 1); BUG_ON(dibh == NULL); - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); if (height == sheight) { struct buffer_head *bh; @@ -544,7 +545,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, /* Branching from existing tree */ case ALLOC_GROW_DEPTH: if (i > 1 && i < height) - gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[i-1], 1); + gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]); for (; i < height && n > 0; i++, n--) gfs2_indirect_init(mp, ip->i_gl, i, mp->mp_list[i-1], bn++); @@ -556,7 +557,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, case ALLOC_DATA: BUG_ON(n > dblks); BUG_ON(mp->mp_bh[end_of_metadata] == NULL); - gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[end_of_metadata], 1); + gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]); dblks = n; ptr = metapointer(end_of_metadata, mp); dblock = bn; @@ -796,8 +797,8 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, down_write(&ip->i_rw_mutex); - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_trans_add_meta(ip->i_gl, bh); bstart = 0; blen = 0; @@ -981,7 +982,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from) } if (!gfs2_is_writeback(ip)) - gfs2_trans_add_bh(ip->i_gl, bh, 0); + gfs2_trans_add_data(ip->i_gl, bh); zero_user(page, offset, length); mark_buffer_dirty(bh); @@ -1046,7 +1047,7 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) if (error) goto out; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); if (gfs2_is_stuffed(ip)) { gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize); @@ -1098,7 +1099,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size) if (error) return error; - error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (error) return error; @@ -1137,11 +1138,12 @@ static int trunc_end(struct gfs2_inode *ip) ip->i_height = 0; ip->i_goal = ip->i_no_addr; gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + gfs2_ordered_del_inode(ip); } ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -1246,7 +1248,7 @@ static int do_grow(struct inode *inode, u64 size) i_size_write(inode, size); ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -1286,6 +1288,10 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize) inode_dio_wait(inode); + ret = gfs2_rs_alloc(GFS2_I(inode)); + if (ret) + return ret; + oldsize = inode->i_size; if (newsize >= oldsize) return do_grow(inode, newsize); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 9a35670fdc38..c3e82bd23179 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -93,7 +93,7 @@ int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, struct buffer_head *bh; bh = gfs2_meta_new(ip->i_gl, block); - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD); gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header)); *bhp = bh; @@ -127,7 +127,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf, if (error) return error; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); if (ip->i_inode.i_size < offset + size) i_size_write(&ip->i_inode, offset + size); @@ -209,7 +209,7 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf, if (error) goto fail; - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); memcpy(bh->b_data + o, buf, amount); brelse(bh); @@ -231,7 +231,7 @@ out: i_size_write(&ip->i_inode, offset + copied); ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -647,7 +647,7 @@ static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh, return; } - gfs2_trans_add_bh(dip->i_gl, bh, 1); + gfs2_trans_add_meta(dip->i_gl, bh); /* If there is no prev entry, this is the first entry in the block. The de_rec_len is already as big as it needs to be. Just zero @@ -690,7 +690,7 @@ static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode, offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len)); totlen = be16_to_cpu(dent->de_rec_len); BUG_ON(offset + name->len > totlen); - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); ndent = (struct gfs2_dirent *)((char *)dent + offset); dent->de_rec_len = cpu_to_be16(offset); gfs2_qstr2dirent(name, totlen - offset, ndent); @@ -831,7 +831,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, return NULL; gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1); - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF); leaf = (struct gfs2_leaf *)bh->b_data; leaf->lf_depth = cpu_to_be16(depth); @@ -916,7 +916,7 @@ static int dir_make_exhash(struct inode *inode) /* We're done with the new leaf block, now setup the new hash table. */ - gfs2_trans_add_bh(dip->i_gl, dibh, 1); + gfs2_trans_add_meta(dip->i_gl, dibh); gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); lp = (__be64 *)(dibh->b_data + sizeof(struct gfs2_dinode)); @@ -976,7 +976,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) return 1; /* can't split */ } - gfs2_trans_add_bh(dip->i_gl, obh, 1); + gfs2_trans_add_meta(dip->i_gl, obh); nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1); if (!nleaf) { @@ -1069,7 +1069,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) error = gfs2_meta_inode_buffer(dip, &dibh); if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) { - gfs2_trans_add_bh(dip->i_gl, dibh, 1); + gfs2_trans_add_meta(dip->i_gl, dibh); gfs2_add_inode_blocks(&dip->i_inode, 1); gfs2_dinode_out(dip, dibh->b_data); brelse(dibh); @@ -1622,7 +1622,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) return error; } while(1); - gfs2_trans_add_bh(ip->i_gl, obh, 1); + gfs2_trans_add_meta(ip->i_gl, obh); leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth)); if (!leaf) { @@ -1636,7 +1636,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) error = gfs2_meta_inode_buffer(ip, &bh); if (error) return error; - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); gfs2_add_inode_blocks(&ip->i_inode, 1); gfs2_dinode_out(ip, bh->b_data); brelse(bh); @@ -1795,7 +1795,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, if (IS_ERR(dent)) return PTR_ERR(dent); - gfs2_trans_add_bh(dip->i_gl, bh, 1); + gfs2_trans_add_meta(dip->i_gl, bh); gfs2_inum_out(nip, dent); dent->de_type = cpu_to_be16(new_type); @@ -1804,7 +1804,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, error = gfs2_meta_inode_buffer(dip, &bh); if (error) return error; - gfs2_trans_add_bh(dip->i_gl, bh, 1); + gfs2_trans_add_meta(dip->i_gl, bh); } dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; @@ -1849,7 +1849,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, if (!ht) return -ENOMEM; - error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + error = gfs2_quota_hold(dip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (error) goto out; @@ -1917,7 +1917,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, if (error) goto out_end_trans; - gfs2_trans_add_bh(dip->i_gl, dibh, 1); + gfs2_trans_add_meta(dip->i_gl, dibh); /* On the last dealloc, make this a regular file in case we crash. (We don't want to free these blocks a second time.) */ if (last_dealloc) diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 4767774a5f3e..9973df4ff565 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -37,10 +37,10 @@ static int gfs2_encode_fh(struct inode *inode, __u32 *p, int *len, if (parent && (*len < GFS2_LARGE_FH_SIZE)) { *len = GFS2_LARGE_FH_SIZE; - return 255; + return FILEID_INVALID; } else if (*len < GFS2_SMALL_FH_SIZE) { *len = GFS2_SMALL_FH_SIZE; - return 255; + return FILEID_INVALID; } fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 991ab2d484dd..019f45e45097 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -157,7 +157,7 @@ static const u32 gfs2_to_fsflags[32] = { static int gfs2_get_flags(struct file *filp, u32 __user *ptr) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int error; @@ -217,7 +217,7 @@ void gfs2_set_inode_flags(struct inode *inode) */ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct buffer_head *bh; @@ -276,7 +276,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) error = gfs2_meta_inode_buffer(ip, &bh); if (error) goto out_trans_end; - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); ip->i_diskflags = new_flags; gfs2_dinode_out(ip, bh->b_data); brelse(bh); @@ -293,7 +293,7 @@ out_drop_write: static int gfs2_set_flags(struct file *filp, u32 __user *ptr) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); u32 fsflags, gfsflags; if (get_user(fsflags, ptr)) @@ -336,7 +336,7 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size) { - struct inode *inode = filep->f_dentry->d_inode; + struct inode *inode = file_inode(filep); struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_inode *ip = GFS2_I(inode); size_t blks = (size + sdp->sd_sb.sb_bsize - 1) >> sdp->sd_sb.sb_bsize_shift; @@ -386,7 +386,7 @@ static int gfs2_allocate_page_backing(struct page *page) static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct inode *inode = file_inode(vma->vm_file); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); unsigned long last_index; @@ -483,7 +483,7 @@ out: gfs2_holder_uninit(&gh); if (ret == 0) { set_page_dirty(page); - wait_on_page_writeback(page); + wait_for_stable_page(page); } sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(ret); @@ -673,8 +673,7 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, { struct file *file = iocb->ki_filp; size_t writesize = iov_length(iov, nr_segs); - struct dentry *dentry = file->f_dentry; - struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_inode *ip = GFS2_I(file_inode(file)); int ret; ret = gfs2_rs_alloc(ip); @@ -709,7 +708,7 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, if (unlikely(error)) return error; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); if (gfs2_is_stuffed(ip)) { error = gfs2_unstuff_dinode(ip, NULL); @@ -772,7 +771,7 @@ static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_inode *ip = GFS2_I(inode); unsigned int data_blocks = 0, ind_blocks = 0, rblocks; @@ -938,7 +937,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) { struct gfs2_file *fp = file->private_data; struct gfs2_holder *fl_gh = &fp->f_fl_gh; - struct gfs2_inode *ip = GFS2_I(file->f_path.dentry->d_inode); + struct gfs2_inode *ip = GFS2_I(file_inode(file)); struct gfs2_glock *gl; unsigned int state; int flags; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 992c5c0cb504..cf3515546739 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -30,6 +30,7 @@ #include <linux/rculist_bl.h> #include <linux/bit_spinlock.h> #include <linux/percpu.h> +#include <linux/list_sort.h> #include "gfs2.h" #include "incore.h" @@ -1376,56 +1377,105 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) gfs2_glock_put(gl); } +static int glock_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct gfs2_glock *gla, *glb; -static int gfs2_shrink_glock_memory(struct shrinker *shrink, - struct shrink_control *sc) + gla = list_entry(a, struct gfs2_glock, gl_lru); + glb = list_entry(b, struct gfs2_glock, gl_lru); + + if (gla->gl_name.ln_number > glb->gl_name.ln_number) + return 1; + if (gla->gl_name.ln_number < glb->gl_name.ln_number) + return -1; + + return 0; +} + +/** + * gfs2_dispose_glock_lru - Demote a list of glocks + * @list: The list to dispose of + * + * Disposing of glocks may involve disk accesses, so that here we sort + * the glocks by number (i.e. disk location of the inodes) so that if + * there are any such accesses, they'll be sent in order (mostly). + * + * Must be called under the lru_lock, but may drop and retake this + * lock. While the lru_lock is dropped, entries may vanish from the + * list, but no new entries will appear on the list (since it is + * private) + */ + +static void gfs2_dispose_glock_lru(struct list_head *list) +__releases(&lru_lock) +__acquires(&lru_lock) { struct gfs2_glock *gl; - int may_demote; - int nr_skipped = 0; - int nr = sc->nr_to_scan; - gfp_t gfp_mask = sc->gfp_mask; - LIST_HEAD(skipped); - if (nr == 0) - goto out; + list_sort(NULL, list, glock_cmp); - if (!(gfp_mask & __GFP_FS)) - return -1; + while(!list_empty(list)) { + gl = list_entry(list->next, struct gfs2_glock, gl_lru); + list_del_init(&gl->gl_lru); + clear_bit(GLF_LRU, &gl->gl_flags); + gfs2_glock_hold(gl); + spin_unlock(&lru_lock); + spin_lock(&gl->gl_spin); + if (demote_ok(gl)) + handle_callback(gl, LM_ST_UNLOCKED, 0); + WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags)); + smp_mb__after_clear_bit(); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put_nolock(gl); + spin_unlock(&gl->gl_spin); + spin_lock(&lru_lock); + } +} + +/** + * gfs2_scan_glock_lru - Scan the LRU looking for locks to demote + * @nr: The number of entries to scan + * + * This function selects the entries on the LRU which are able to + * be demoted, and then kicks off the process by calling + * gfs2_dispose_glock_lru() above. + */ + +static void gfs2_scan_glock_lru(int nr) +{ + struct gfs2_glock *gl; + LIST_HEAD(skipped); + LIST_HEAD(dispose); spin_lock(&lru_lock); while(nr && !list_empty(&lru_list)) { gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru); - list_del_init(&gl->gl_lru); - clear_bit(GLF_LRU, &gl->gl_flags); - atomic_dec(&lru_count); /* Test for being demotable */ if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { - gfs2_glock_hold(gl); - spin_unlock(&lru_lock); - spin_lock(&gl->gl_spin); - may_demote = demote_ok(gl); - if (may_demote) { - handle_callback(gl, LM_ST_UNLOCKED, 0); - nr--; - } - clear_bit(GLF_LOCK, &gl->gl_flags); - smp_mb__after_clear_bit(); - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put_nolock(gl); - spin_unlock(&gl->gl_spin); - spin_lock(&lru_lock); + list_move(&gl->gl_lru, &dispose); + atomic_dec(&lru_count); + nr--; continue; } - nr_skipped++; - list_add(&gl->gl_lru, &skipped); - set_bit(GLF_LRU, &gl->gl_flags); + + list_move(&gl->gl_lru, &skipped); } list_splice(&skipped, &lru_list); - atomic_add(nr_skipped, &lru_count); + if (!list_empty(&dispose)) + gfs2_dispose_glock_lru(&dispose); spin_unlock(&lru_lock); -out: +} + +static int gfs2_shrink_glock_memory(struct shrinker *shrink, + struct shrink_control *sc) +{ + if (sc->nr_to_scan) { + if (!(sc->gfp_mask & __GFP_FS)) + return -1; + gfs2_scan_glock_lru(sc->nr_to_scan); + } + return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure; } diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 78d4184ffc7d..444b6503ebc4 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -322,8 +322,8 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) break; }; - ip->i_inode.i_uid = be32_to_cpu(str->di_uid); - ip->i_inode.i_gid = be32_to_cpu(str->di_gid); + i_uid_write(&ip->i_inode, be32_to_cpu(str->di_uid)); + i_gid_write(&ip->i_inode, be32_to_cpu(str->di_gid)); gfs2_set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink)); i_size_write(&ip->i_inode, be64_to_cpu(str->di_size)); gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index c373a24fedd9..156e42ec84ea 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -52,7 +52,6 @@ struct gfs2_log_header_host { */ struct gfs2_log_operations { - void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); void (*lo_before_commit) (struct gfs2_sbd *sdp); void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai); void (*lo_before_scan) (struct gfs2_jdesc *jd, @@ -341,6 +340,7 @@ enum { GIF_QD_LOCKED = 1, GIF_ALLOC_FAILED = 2, GIF_SW_PAGED = 3, + GIF_ORDERED = 4, }; struct gfs2_inode { @@ -357,6 +357,7 @@ struct gfs2_inode { struct gfs2_rgrpd *i_rgd; u64 i_goal; /* goal block for allocations */ struct rw_semaphore i_rw_mutex; + struct list_head i_ordered; struct list_head i_trunc_list; __be64 *i_hash_cache; u32 i_entries; @@ -391,7 +392,6 @@ struct gfs2_revoke_replay { }; enum { - QDF_USER = 0, QDF_CHANGE = 1, QDF_LOCKED = 2, QDF_REFRESH = 3, @@ -403,7 +403,7 @@ struct gfs2_quota_data { atomic_t qd_count; - u32 qd_id; + struct kqid qd_id; unsigned long qd_flags; /* QDF_... */ s64 qd_change; @@ -641,6 +641,7 @@ struct gfs2_sbd { wait_queue_head_t sd_glock_wait; atomic_t sd_glock_disposal; struct completion sd_locking_init; + struct completion sd_wdack; struct delayed_work sd_control_work; /* Inode Stuff */ @@ -723,6 +724,7 @@ struct gfs2_sbd { struct list_head sd_log_le_revoke; struct list_head sd_log_le_databuf; struct list_head sd_log_le_ordered; + spinlock_t sd_ordered_lock; atomic_t sd_log_thresh1; atomic_t sd_log_thresh2; @@ -758,10 +760,7 @@ struct gfs2_sbd { unsigned int sd_replayed_blocks; /* For quiescing the filesystem */ - struct gfs2_holder sd_freeze_gh; - struct mutex sd_freeze_lock; - unsigned int sd_freeze_count; char sd_fsname[GFS2_FSNAME_LEN]; char sd_table_name[GFS2_FSNAME_LEN]; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 2b6f5698ef18..cc00bd1d1f87 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -368,10 +368,11 @@ static void munge_mode_uid_gid(const struct gfs2_inode *dip, struct inode *inode) { if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir && - (dip->i_inode.i_mode & S_ISUID) && dip->i_inode.i_uid) { + (dip->i_inode.i_mode & S_ISUID) && + !uid_eq(dip->i_inode.i_uid, GLOBAL_ROOT_UID)) { if (S_ISDIR(inode->i_mode)) inode->i_mode |= S_ISUID; - else if (dip->i_inode.i_uid != current_fsuid()) + else if (!uid_eq(dip->i_inode.i_uid, current_fsuid())) inode->i_mode &= ~07111; inode->i_uid = dip->i_inode.i_uid; } else @@ -447,7 +448,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, struct timespec tv = CURRENT_TIME; dibh = gfs2_meta_new(ip->i_gl, ip->i_no_addr); - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI); gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); di = (struct gfs2_dinode *)dibh->b_data; @@ -455,8 +456,8 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, di->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); di->di_num.no_addr = cpu_to_be64(ip->i_no_addr); di->di_mode = cpu_to_be32(ip->i_inode.i_mode); - di->di_uid = cpu_to_be32(ip->i_inode.i_uid); - di->di_gid = cpu_to_be32(ip->i_inode.i_gid); + di->di_uid = cpu_to_be32(i_uid_read(&ip->i_inode)); + di->di_gid = cpu_to_be32(i_gid_read(&ip->i_inode)); di->di_nlink = 0; di->di_size = cpu_to_be64(ip->i_inode.i_size); di->di_blocks = cpu_to_be64(1); @@ -548,7 +549,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, if (error) return error; - error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + error = gfs2_quota_lock(dip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (error) goto fail; @@ -584,7 +585,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, if (error) goto fail_end_trans; set_nlink(&ip->i_inode, S_ISDIR(ip->i_inode.i_mode) ? 2 : 1); - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); return 0; @@ -931,7 +932,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (error) goto out_brelse; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); inc_nlink(&ip->i_inode); ip->i_inode.i_ctime = CURRENT_TIME; ihold(inode); @@ -978,8 +979,8 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, return -EPERM; if ((dip->i_inode.i_mode & S_ISVTX) && - dip->i_inode.i_uid != current_fsuid() && - ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER)) + !uid_eq(dip->i_inode.i_uid, current_fsuid()) && + !uid_eq(ip->i_inode.i_uid, current_fsuid()) && !capable(CAP_FOWNER)) return -EPERM; if (IS_APPEND(&dip->i_inode)) @@ -1412,7 +1413,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) goto out_end_trans; ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); } @@ -1580,7 +1581,8 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - u32 ouid, ogid, nuid, ngid; + kuid_t ouid, nuid; + kgid_t ogid, ngid; int error; ouid = inode->i_uid; @@ -1588,16 +1590,17 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) nuid = attr->ia_uid; ngid = attr->ia_gid; - if (!(attr->ia_valid & ATTR_UID) || ouid == nuid) - ouid = nuid = NO_QUOTA_CHANGE; - if (!(attr->ia_valid & ATTR_GID) || ogid == ngid) - ogid = ngid = NO_QUOTA_CHANGE; + if (!(attr->ia_valid & ATTR_UID) || uid_eq(ouid, nuid)) + ouid = nuid = NO_UID_QUOTA_CHANGE; + if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid)) + ogid = ngid = NO_GID_QUOTA_CHANGE; error = gfs2_quota_lock(ip, nuid, ngid); if (error) return error; - if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { + if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || + !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) { error = gfs2_quota_check(ip, nuid, ngid); if (error) goto out_gunlock_q; @@ -1611,7 +1614,8 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) if (error) goto out_end_trans; - if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { + if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || + !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) { u64 blocks = gfs2_get_inode_blocks(&ip->i_inode); gfs2_quota_change(ip, -blocks, ouid, ogid); gfs2_quota_change(ip, blocks, nuid, ngid); diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index f4beeb9c81c1..9a2ca8be7647 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -482,70 +482,66 @@ static void log_flush_wait(struct gfs2_sbd *sdp) } } -static int bd_cmp(void *priv, struct list_head *a, struct list_head *b) +static int ip_cmp(void *priv, struct list_head *a, struct list_head *b) { - struct gfs2_bufdata *bda, *bdb; + struct gfs2_inode *ipa, *ipb; - bda = list_entry(a, struct gfs2_bufdata, bd_list); - bdb = list_entry(b, struct gfs2_bufdata, bd_list); + ipa = list_entry(a, struct gfs2_inode, i_ordered); + ipb = list_entry(b, struct gfs2_inode, i_ordered); - if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr) + if (ipa->i_no_addr < ipb->i_no_addr) return -1; - if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr) + if (ipa->i_no_addr > ipb->i_no_addr) return 1; return 0; } static void gfs2_ordered_write(struct gfs2_sbd *sdp) { - struct gfs2_bufdata *bd; - struct buffer_head *bh; + struct gfs2_inode *ip; LIST_HEAD(written); - gfs2_log_lock(sdp); - list_sort(NULL, &sdp->sd_log_le_ordered, &bd_cmp); + spin_lock(&sdp->sd_ordered_lock); + list_sort(NULL, &sdp->sd_log_le_ordered, &ip_cmp); while (!list_empty(&sdp->sd_log_le_ordered)) { - bd = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_bufdata, bd_list); - list_move(&bd->bd_list, &written); - bh = bd->bd_bh; - if (!buffer_dirty(bh)) + ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered); + list_move(&ip->i_ordered, &written); + if (ip->i_inode.i_mapping->nrpages == 0) continue; - get_bh(bh); - gfs2_log_unlock(sdp); - lock_buffer(bh); - if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) { - bh->b_end_io = end_buffer_write_sync; - submit_bh(WRITE_SYNC, bh); - } else { - unlock_buffer(bh); - brelse(bh); - } - gfs2_log_lock(sdp); + spin_unlock(&sdp->sd_ordered_lock); + filemap_fdatawrite(ip->i_inode.i_mapping); + spin_lock(&sdp->sd_ordered_lock); } list_splice(&written, &sdp->sd_log_le_ordered); - gfs2_log_unlock(sdp); + spin_unlock(&sdp->sd_ordered_lock); } static void gfs2_ordered_wait(struct gfs2_sbd *sdp) { - struct gfs2_bufdata *bd; - struct buffer_head *bh; + struct gfs2_inode *ip; - gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ordered_lock); while (!list_empty(&sdp->sd_log_le_ordered)) { - bd = list_entry(sdp->sd_log_le_ordered.prev, struct gfs2_bufdata, bd_list); - bh = bd->bd_bh; - if (buffer_locked(bh)) { - get_bh(bh); - gfs2_log_unlock(sdp); - wait_on_buffer(bh); - brelse(bh); - gfs2_log_lock(sdp); + ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered); + list_del(&ip->i_ordered); + WARN_ON(!test_and_clear_bit(GIF_ORDERED, &ip->i_flags)); + if (ip->i_inode.i_mapping->nrpages == 0) continue; - } - list_del_init(&bd->bd_list); + spin_unlock(&sdp->sd_ordered_lock); + filemap_fdatawait(ip->i_inode.i_mapping); + spin_lock(&sdp->sd_ordered_lock); } - gfs2_log_unlock(sdp); + spin_unlock(&sdp->sd_ordered_lock); +} + +void gfs2_ordered_del_inode(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + + spin_lock(&sdp->sd_ordered_lock); + if (test_and_clear_bit(GIF_ORDERED, &ip->i_flags)) + list_del(&ip->i_ordered); + spin_unlock(&sdp->sd_ordered_lock); } /** diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 3fd5215ea25f..3566f35915e0 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -48,6 +48,18 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp, sdp->sd_log_head = sdp->sd_log_tail = value; } +static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + + if (!test_bit(GIF_ORDERED, &ip->i_flags)) { + spin_lock(&sdp->sd_ordered_lock); + if (!test_and_set_bit(GIF_ORDERED, &ip->i_flags)) + list_add(&ip->i_ordered, &sdp->sd_log_le_ordered); + spin_unlock(&sdp->sd_ordered_lock); + } +} +extern void gfs2_ordered_del_inode(struct gfs2_inode *ip); extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, unsigned int ssize); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 9ceccb1595a3..a5055977a214 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -37,7 +37,7 @@ * * The log lock must be held when calling this function */ -static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) +void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) { struct gfs2_bufdata *bd; @@ -388,32 +388,6 @@ static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type, return page; } -static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) -{ - struct gfs2_meta_header *mh; - struct gfs2_trans *tr; - - tr = current->journal_info; - tr->tr_touched = 1; - if (!list_empty(&bd->bd_list)) - return; - set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); - set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); - mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; - if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) { - printk(KERN_ERR - "Attempting to add uninitialised block to journal (inplace block=%lld)\n", - (unsigned long long)bd->bd_bh->b_blocknr); - BUG(); - } - gfs2_pin(sdp, bd->bd_bh); - mh->__pad0 = cpu_to_be64(0); - mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); - sdp->sd_log_num_buf++; - list_add(&bd->bd_list, &sdp->sd_log_le_buf); - tr->tr_num_buf_new++; -} - static void gfs2_check_magic(struct buffer_head *bh) { void *kaddr; @@ -600,20 +574,6 @@ static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); } -static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) -{ - struct gfs2_glock *gl = bd->bd_gl; - struct gfs2_trans *tr; - - tr = current->journal_info; - tr->tr_touched = 1; - tr->tr_num_revoke++; - sdp->sd_log_num_revoke++; - atomic_inc(&gl->gl_revokes); - set_bit(GLF_LFLUSH, &gl->gl_flags); - list_add(&bd->bd_list, &sdp->sd_log_le_revoke); -} - static void revoke_lo_before_commit(struct gfs2_sbd *sdp) { struct gfs2_meta_header *mh; @@ -749,44 +709,6 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) } /** - * databuf_lo_add - Add a databuf to the transaction. - * - * This is used in two distinct cases: - * i) In ordered write mode - * We put the data buffer on a list so that we can ensure that its - * synced to disk at the right time - * ii) In journaled data mode - * We need to journal the data block in the same way as metadata in - * the functions above. The difference is that here we have a tag - * which is two __be64's being the block number (as per meta data) - * and a flag which says whether the data block needs escaping or - * not. This means we need a new log entry for each 251 or so data - * blocks, which isn't an enormous overhead but twice as much as - * for normal metadata blocks. - */ -static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) -{ - struct gfs2_trans *tr = current->journal_info; - struct address_space *mapping = bd->bd_bh->b_page->mapping; - struct gfs2_inode *ip = GFS2_I(mapping->host); - - if (tr) - tr->tr_touched = 1; - if (!list_empty(&bd->bd_list)) - return; - set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); - set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); - if (gfs2_is_jdata(ip)) { - gfs2_pin(sdp, bd->bd_bh); - tr->tr_num_databuf_new++; - sdp->sd_log_num_databuf++; - list_add_tail(&bd->bd_list, &sdp->sd_log_le_databuf); - } else { - list_add_tail(&bd->bd_list, &sdp->sd_log_le_ordered); - } -} - -/** * databuf_lo_before_commit - Scan the data buffers, writing as we go * */ @@ -885,7 +807,6 @@ static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) const struct gfs2_log_operations gfs2_buf_lops = { - .lo_add = buf_lo_add, .lo_before_commit = buf_lo_before_commit, .lo_after_commit = buf_lo_after_commit, .lo_before_scan = buf_lo_before_scan, @@ -895,7 +816,6 @@ const struct gfs2_log_operations gfs2_buf_lops = { }; const struct gfs2_log_operations gfs2_revoke_lops = { - .lo_add = revoke_lo_add, .lo_before_commit = revoke_lo_before_commit, .lo_after_commit = revoke_lo_after_commit, .lo_before_scan = revoke_lo_before_scan, @@ -909,7 +829,6 @@ const struct gfs2_log_operations gfs2_rg_lops = { }; const struct gfs2_log_operations gfs2_databuf_lops = { - .lo_add = databuf_lo_add, .lo_before_commit = databuf_lo_before_commit, .lo_after_commit = databuf_lo_after_commit, .lo_scan_elements = databuf_lo_scan_elements, diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 954a330585f4..ba77b7da8325 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -29,6 +29,7 @@ extern const struct gfs2_log_operations gfs2_databuf_lops; extern const struct gfs2_log_operations *gfs2_log_ops[]; extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page); extern void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw); +extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); static inline unsigned int buf_limit(struct gfs2_sbd *sdp) { @@ -46,19 +47,6 @@ static inline unsigned int databuf_limit(struct gfs2_sbd *sdp) return limit; } -static inline void lops_init_le(struct gfs2_bufdata *bd, - const struct gfs2_log_operations *lops) -{ - INIT_LIST_HEAD(&bd->bd_list); - bd->bd_ops = lops; -} - -static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) -{ - if (bd->bd_ops->lo_add) - bd->bd_ops->lo_add(sdp, bd); -} - static inline void lops_before_commit(struct gfs2_sbd *sdp) { int x; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 22255d96b27e..b059bbb5059e 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -271,41 +271,6 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) return 0; } -/** - * gfs2_attach_bufdata - attach a struct gfs2_bufdata structure to a buffer - * @gl: the glock the buffer belongs to - * @bh: The buffer to be attached to - * @meta: Flag to indicate whether its metadata or not - */ - -void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, - int meta) -{ - struct gfs2_bufdata *bd; - - if (meta) - lock_page(bh->b_page); - - if (bh->b_private) { - if (meta) - unlock_page(bh->b_page); - return; - } - - bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL); - bd->bd_bh = bh; - bd->bd_gl = gl; - - if (meta) - lops_init_le(bd, &gfs2_buf_lops); - else - lops_init_le(bd, &gfs2_databuf_lops); - bh->b_private = bd; - - if (meta) - unlock_page(bh->b_page); -} - void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta) { struct address_space *mapping = bh->b_page->mapping; diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index c30973b07a7c..0d4c843b6f8e 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -56,9 +56,6 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create); -void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, - int meta); - void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 0e3554edb8f2..60ede2a0f43f 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -20,6 +20,7 @@ #include <linux/gfs2_ondisk.h> #include <linux/quotaops.h> #include <linux/lockdep.h> +#include <linux/module.h> #include "gfs2.h" #include "incore.h" @@ -81,6 +82,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) init_waitqueue_head(&sdp->sd_glock_wait); atomic_set(&sdp->sd_glock_disposal, 0); init_completion(&sdp->sd_locking_init); + init_completion(&sdp->sd_wdack); spin_lock_init(&sdp->sd_statfs_spin); spin_lock_init(&sdp->sd_rindex_spin); @@ -102,6 +104,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) INIT_LIST_HEAD(&sdp->sd_log_le_revoke); INIT_LIST_HEAD(&sdp->sd_log_le_databuf); INIT_LIST_HEAD(&sdp->sd_log_le_ordered); + spin_lock_init(&sdp->sd_ordered_lock); init_waitqueue_head(&sdp->sd_log_waitq); init_waitqueue_head(&sdp->sd_logd_waitq); @@ -115,8 +118,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) INIT_LIST_HEAD(&sdp->sd_revoke_list); - mutex_init(&sdp->sd_freeze_lock); - return sdp; } @@ -1425,6 +1426,7 @@ struct file_system_type gfs2_fs_type = { .kill_sb = gfs2_kill_sb, .owner = THIS_MODULE, }; +MODULE_ALIAS_FS("gfs2"); struct file_system_type gfs2meta_fs_type = { .name = "gfs2meta", @@ -1432,4 +1434,4 @@ struct file_system_type gfs2meta_fs_type = { .mount = gfs2_mount_meta, .owner = THIS_MODULE, }; - +MODULE_ALIAS_FS("gfs2meta"); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index ae55e248c3b7..c7c840e916f8 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -65,13 +65,10 @@ #include "inode.h" #include "util.h" -#define QUOTA_USER 1 -#define QUOTA_GROUP 0 - struct gfs2_quota_change_host { u64 qc_change; u32 qc_flags; /* GFS2_QCF_... */ - u32 qc_id; + struct kqid qc_id; }; static LIST_HEAD(qd_lru_list); @@ -120,17 +117,24 @@ out: return (atomic_read(&qd_lru_count) * sysctl_vfs_cache_pressure) / 100; } +static u64 qd2index(struct gfs2_quota_data *qd) +{ + struct kqid qid = qd->qd_id; + return (2 * (u64)from_kqid(&init_user_ns, qid)) + + (qid.type == USRQUOTA) ? 0 : 1; +} + static u64 qd2offset(struct gfs2_quota_data *qd) { u64 offset; - offset = 2 * (u64)qd->qd_id + !test_bit(QDF_USER, &qd->qd_flags); + offset = qd2index(qd); offset *= sizeof(struct gfs2_quota); return offset; } -static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id, +static int qd_alloc(struct gfs2_sbd *sdp, struct kqid qid, struct gfs2_quota_data **qdp) { struct gfs2_quota_data *qd; @@ -141,13 +145,11 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id, return -ENOMEM; atomic_set(&qd->qd_count, 1); - qd->qd_id = id; - if (user) - set_bit(QDF_USER, &qd->qd_flags); + qd->qd_id = qid; qd->qd_slot = -1; INIT_LIST_HEAD(&qd->qd_reclaim); - error = gfs2_glock_get(sdp, 2 * (u64)id + !user, + error = gfs2_glock_get(sdp, qd2index(qd), &gfs2_quota_glops, CREATE, &qd->qd_gl); if (error) goto fail; @@ -161,7 +163,7 @@ fail: return error; } -static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, +static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, struct gfs2_quota_data **qdp) { struct gfs2_quota_data *qd = NULL, *new_qd = NULL; @@ -173,8 +175,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, found = 0; spin_lock(&qd_lru_lock); list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { - if (qd->qd_id == id && - !test_bit(QDF_USER, &qd->qd_flags) == !user) { + if (qid_eq(qd->qd_id, qid)) { if (!atomic_read(&qd->qd_count) && !list_empty(&qd->qd_reclaim)) { /* Remove it from reclaim list */ @@ -208,7 +209,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, return 0; } - error = qd_alloc(sdp, user, id, &new_qd); + error = qd_alloc(sdp, qid, &new_qd); if (error) return error; } @@ -458,12 +459,12 @@ static void qd_unlock(struct gfs2_quota_data *qd) qd_put(qd); } -static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id, +static int qdsb_get(struct gfs2_sbd *sdp, struct kqid qid, struct gfs2_quota_data **qdp) { int error; - error = qd_get(sdp, user, id, qdp); + error = qd_get(sdp, qid, qdp); if (error) return error; @@ -491,7 +492,7 @@ static void qdsb_put(struct gfs2_quota_data *qd) qd_put(qd); } -int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid) +int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data **qd; @@ -512,28 +513,30 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid) if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return 0; - error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, qd); + error = qdsb_get(sdp, make_kqid_uid(ip->i_inode.i_uid), qd); if (error) goto out; ip->i_res->rs_qa_qd_num++; qd++; - error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, qd); + error = qdsb_get(sdp, make_kqid_gid(ip->i_inode.i_gid), qd); if (error) goto out; ip->i_res->rs_qa_qd_num++; qd++; - if (uid != NO_QUOTA_CHANGE && uid != ip->i_inode.i_uid) { - error = qdsb_get(sdp, QUOTA_USER, uid, qd); + if (!uid_eq(uid, NO_UID_QUOTA_CHANGE) && + !uid_eq(uid, ip->i_inode.i_uid)) { + error = qdsb_get(sdp, make_kqid_uid(uid), qd); if (error) goto out; ip->i_res->rs_qa_qd_num++; qd++; } - if (gid != NO_QUOTA_CHANGE && gid != ip->i_inode.i_gid) { - error = qdsb_get(sdp, QUOTA_GROUP, gid, qd); + if (!gid_eq(gid, NO_GID_QUOTA_CHANGE) && + !gid_eq(gid, ip->i_inode.i_gid)) { + error = qdsb_get(sdp, make_kqid_gid(gid), qd); if (error) goto out; ip->i_res->rs_qa_qd_num++; @@ -567,18 +570,10 @@ static int sort_qd(const void *a, const void *b) const struct gfs2_quota_data *qd_a = *(const struct gfs2_quota_data **)a; const struct gfs2_quota_data *qd_b = *(const struct gfs2_quota_data **)b; - if (!test_bit(QDF_USER, &qd_a->qd_flags) != - !test_bit(QDF_USER, &qd_b->qd_flags)) { - if (test_bit(QDF_USER, &qd_a->qd_flags)) - return -1; - else - return 1; - } - if (qd_a->qd_id < qd_b->qd_id) + if (qid_lt(qd_a->qd_id, qd_b->qd_id)) return -1; - if (qd_a->qd_id > qd_b->qd_id) + if (qid_lt(qd_b->qd_id, qd_a->qd_id)) return 1; - return 0; } @@ -590,14 +585,14 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change) s64 x; mutex_lock(&sdp->sd_quota_mutex); - gfs2_trans_add_bh(ip->i_gl, qd->qd_bh, 1); + gfs2_trans_add_meta(ip->i_gl, qd->qd_bh); if (!test_bit(QDF_CHANGE, &qd->qd_flags)) { qc->qc_change = 0; qc->qc_flags = 0; - if (test_bit(QDF_USER, &qd->qd_flags)) + if (qd->qd_id.type == USRQUOTA) qc->qc_flags = cpu_to_be32(GFS2_QCF_USER); - qc->qc_id = cpu_to_be32(qd->qd_id); + qc->qc_id = cpu_to_be32(from_kqid(&init_user_ns, qd->qd_id)); } x = be64_to_cpu(qc->qc_change) + change; @@ -726,7 +721,7 @@ get_a_page: goto unlock_out; } - gfs2_trans_add_bh(ip->i_gl, bh, 0); + gfs2_trans_add_meta(ip->i_gl, bh); kaddr = kmap_atomic(page); if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE) @@ -925,7 +920,7 @@ fail: return error; } -int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid) +int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data *qd; @@ -1040,13 +1035,13 @@ static int print_message(struct gfs2_quota_data *qd, char *type) printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\n", sdp->sd_fsname, type, - (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group", - qd->qd_id); + (qd->qd_id.type == USRQUOTA) ? "user" : "group", + from_kqid(&init_user_ns, qd->qd_id)); return 0; } -int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) +int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data *qd; @@ -1063,8 +1058,8 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { qd = ip->i_res->rs_qa_qd[x]; - if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) || - (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags)))) + if (!(qid_eq(qd->qd_id, make_kqid_uid(uid)) || + qid_eq(qd->qd_id, make_kqid_gid(gid)))) continue; value = (s64)be64_to_cpu(qd->qd_qb.qb_value); @@ -1074,10 +1069,7 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) { print_message(qd, "exceeded"); - quota_send_warning(make_kqid(&init_user_ns, - test_bit(QDF_USER, &qd->qd_flags) ? - USRQUOTA : GRPQUOTA, - qd->qd_id), + quota_send_warning(qd->qd_id, sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN); error = -EDQUOT; @@ -1087,10 +1079,7 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) time_after_eq(jiffies, qd->qd_last_warn + gfs2_tune_get(sdp, gt_quota_warn_period) * HZ)) { - quota_send_warning(make_kqid(&init_user_ns, - test_bit(QDF_USER, &qd->qd_flags) ? - USRQUOTA : GRPQUOTA, - qd->qd_id), + quota_send_warning(qd->qd_id, sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN); error = print_message(qd, "warning"); qd->qd_last_warn = jiffies; @@ -1101,7 +1090,7 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) } void gfs2_quota_change(struct gfs2_inode *ip, s64 change, - u32 uid, u32 gid) + kuid_t uid, kgid_t gid) { struct gfs2_quota_data *qd; unsigned int x; @@ -1114,8 +1103,8 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change, for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { qd = ip->i_res->rs_qa_qd[x]; - if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) || - (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) { + if (qid_eq(qd->qd_id, make_kqid_uid(uid)) || + qid_eq(qd->qd_id, make_kqid_gid(gid))) { do_qc(qd, change); } } @@ -1170,13 +1159,13 @@ static int gfs2_quota_sync_timeo(struct super_block *sb, int type) return gfs2_quota_sync(sb, type); } -int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id) +int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid) { struct gfs2_quota_data *qd; struct gfs2_holder q_gh; int error; - error = qd_get(sdp, user, id, &qd); + error = qd_get(sdp, qid, &qd); if (error) return error; @@ -1194,7 +1183,9 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void * qc->qc_change = be64_to_cpu(str->qc_change); qc->qc_flags = be32_to_cpu(str->qc_flags); - qc->qc_id = be32_to_cpu(str->qc_id); + qc->qc_id = make_kqid(&init_user_ns, + (qc->qc_flags & GFS2_QCF_USER)?USRQUOTA:GRPQUOTA, + be32_to_cpu(str->qc_id)); } int gfs2_quota_init(struct gfs2_sbd *sdp) @@ -1257,8 +1248,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) if (!qc.qc_change) continue; - error = qd_alloc(sdp, (qc.qc_flags & GFS2_QCF_USER), - qc.qc_id, &qd); + error = qd_alloc(sdp, qc.qc_id, &qd); if (error) { brelse(bh); goto fail; @@ -1485,21 +1475,17 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid, struct gfs2_quota_data *qd; struct gfs2_holder q_gh; int error; - int type; memset(fdq, 0, sizeof(struct fs_disk_quota)); if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return -ESRCH; /* Crazy XFS error code */ - if (qid.type == USRQUOTA) - type = QUOTA_USER; - else if (qid.type == GRPQUOTA) - type = QUOTA_GROUP; - else + if ((qid.type != USRQUOTA) && + (qid.type != GRPQUOTA)) return -EINVAL; - error = qd_get(sdp, type, from_kqid(&init_user_ns, qid), &qd); + error = qd_get(sdp, qid, &qd); if (error) return error; error = do_glock(qd, FORCE, &q_gh); @@ -1508,8 +1494,8 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid, qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; fdq->d_version = FS_DQUOT_VERSION; - fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA; - fdq->d_id = from_kqid(&init_user_ns, qid); + fdq->d_flags = (qid.type == USRQUOTA) ? FS_USER_QUOTA : FS_GROUP_QUOTA; + fdq->d_id = from_kqid_munged(current_user_ns(), qid); fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift; fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift; fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift; @@ -1535,32 +1521,18 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, int alloc_required; loff_t offset; int error; - int type; if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return -ESRCH; /* Crazy XFS error code */ - switch(qid.type) { - case USRQUOTA: - type = QUOTA_USER; - if (fdq->d_flags != FS_USER_QUOTA) - return -EINVAL; - break; - case GRPQUOTA: - type = QUOTA_GROUP; - if (fdq->d_flags != FS_GROUP_QUOTA) - return -EINVAL; - break; - default: + if ((qid.type != USRQUOTA) && + (qid.type != GRPQUOTA)) return -EINVAL; - } if (fdq->d_fieldmask & ~GFS2_FIELDMASK) return -EINVAL; - if (fdq->d_id != from_kqid(&init_user_ns, qid)) - return -EINVAL; - error = qd_get(sdp, type, from_kqid(&init_user_ns, qid), &qd); + error = qd_get(sdp, qid, &qd); if (error) return error; diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index f25d98b87904..4f5e6e44ed83 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -14,20 +14,21 @@ struct gfs2_inode; struct gfs2_sbd; struct shrink_control; -#define NO_QUOTA_CHANGE ((u32)-1) +#define NO_UID_QUOTA_CHANGE INVALID_UID +#define NO_GID_QUOTA_CHANGE INVALID_GID -extern int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid); +extern int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); extern void gfs2_quota_unhold(struct gfs2_inode *ip); -extern int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid); +extern int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); extern void gfs2_quota_unlock(struct gfs2_inode *ip); -extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid); +extern int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change, - u32 uid, u32 gid); + kuid_t uid, kgid_t gid); extern int gfs2_quota_sync(struct super_block *sb, int type); -extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id); +extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid); extern int gfs2_quota_init(struct gfs2_sbd *sdp); extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp); @@ -41,7 +42,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) int ret; if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return 0; - ret = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (ret) return ret; if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index b7eff078fe90..d1f51fd73f86 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1257,7 +1257,7 @@ fail: int gfs2_fitrim(struct file *filp, void __user *argp) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = file_inode(filp); struct gfs2_sbd *sdp = GFS2_SB(inode); struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev); struct buffer_head *bh; @@ -1323,7 +1323,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp) if (ret == 0) { bh = rgd->rd_bits[0].bi_bh; rgd->rd_flags |= GFS2_RGF_TRIMMED; - gfs2_trans_add_bh(rgd->rd_gl, bh, 1); + gfs2_trans_add_meta(rgd->rd_gl, bh); gfs2_rgrp_out(rgd, bh->b_data); gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, bh->b_data); gfs2_trans_end(sdp); @@ -1968,14 +1968,14 @@ static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode, *n = 1; block = gfs2_rbm_to_block(rbm); - gfs2_trans_add_bh(rbm->rgd->rd_gl, rbm->bi->bi_bh, 1); + gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm->bi->bi_bh); gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED); block++; while (*n < elen) { ret = gfs2_rbm_from_block(&pos, block); if (ret || gfs2_testbit(&pos) != GFS2_BLKST_FREE) break; - gfs2_trans_add_bh(pos.rgd->rd_gl, pos.bi->bi_bh, 1); + gfs2_trans_add_meta(pos.rgd->rd_gl, pos.bi->bi_bh); gfs2_setbit(&pos, true, GFS2_BLKST_USED); (*n)++; block++; @@ -2014,7 +2014,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, rbm.bi->bi_bh->b_data + rbm.bi->bi_offset, rbm.bi->bi_len); } - gfs2_trans_add_bh(rbm.rgd->rd_gl, rbm.bi->bi_bh, 1); + gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.bi->bi_bh); gfs2_setbit(&rbm, false, new_state); } @@ -2157,7 +2157,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, if (error == 0) { struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal); brelse(dibh); @@ -2176,7 +2176,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, *generation = rbm.rgd->rd_igeneration++; } - gfs2_trans_add_bh(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh, 1); + gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data); gfs2_rgrp_ondisk2lvb(rbm.rgd->rd_rgl, rbm.rgd->rd_bits[0].bi_bh->b_data); @@ -2223,7 +2223,7 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta) trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE); rgd->rd_free += blen; rgd->rd_flags &= ~GFS2_RGF_TRIMMED; - gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); @@ -2260,7 +2260,7 @@ void gfs2_unlink_di(struct inode *inode) if (!rgd) return; trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED); - gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); update_rgrp_lvb_unlinked(rgd, 1); @@ -2281,7 +2281,7 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) rgd->rd_dinodes--; rgd->rd_free++; - gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); update_rgrp_lvb_unlinked(rgd, -1); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index d6488674d916..cab77b8ba84f 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -500,7 +500,7 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, if (error) return; - gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); + gfs2_trans_add_meta(l_ip->i_gl, l_bh); spin_lock(&sdp->sd_statfs_spin); l_sc->sc_total += total; @@ -528,7 +528,7 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; - gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); + gfs2_trans_add_meta(l_ip->i_gl, l_bh); spin_lock(&sdp->sd_statfs_spin); m_sc->sc_total += l_sc->sc_total; @@ -539,7 +539,7 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, 0, sizeof(struct gfs2_statfs_change)); spin_unlock(&sdp->sd_statfs_spin); - gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1); + gfs2_trans_add_meta(m_ip->i_gl, m_bh); gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); } @@ -663,54 +663,6 @@ out: return error; } -/** - * gfs2_freeze_fs - freezes the file system - * @sdp: the file system - * - * This function flushes data and meta data for all machines by - * acquiring the transaction log exclusively. All journals are - * ensured to be in a clean state as well. - * - * Returns: errno - */ - -int gfs2_freeze_fs(struct gfs2_sbd *sdp) -{ - int error = 0; - - mutex_lock(&sdp->sd_freeze_lock); - - if (!sdp->sd_freeze_count++) { - error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh); - if (error) - sdp->sd_freeze_count--; - } - - mutex_unlock(&sdp->sd_freeze_lock); - - return error; -} - -/** - * gfs2_unfreeze_fs - unfreezes the file system - * @sdp: the file system - * - * This function allows the file system to proceed by unlocking - * the exclusively held transaction lock. Other GFS2 nodes are - * now free to acquire the lock shared and go on with their lives. - * - */ - -void gfs2_unfreeze_fs(struct gfs2_sbd *sdp) -{ - mutex_lock(&sdp->sd_freeze_lock); - - if (sdp->sd_freeze_count && !--sdp->sd_freeze_count) - gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); - - mutex_unlock(&sdp->sd_freeze_lock); -} - void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) { struct gfs2_dinode *str = buf; @@ -721,8 +673,8 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) str->di_num.no_addr = cpu_to_be64(ip->i_no_addr); str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); str->di_mode = cpu_to_be32(ip->i_inode.i_mode); - str->di_uid = cpu_to_be32(ip->i_inode.i_uid); - str->di_gid = cpu_to_be32(ip->i_inode.i_gid); + str->di_uid = cpu_to_be32(i_uid_read(&ip->i_inode)); + str->di_gid = cpu_to_be32(i_gid_read(&ip->i_inode)); str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); str->di_size = cpu_to_be64(i_size_read(&ip->i_inode)); str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); @@ -824,7 +776,7 @@ static void gfs2_dirty_inode(struct inode *inode, int flags) ret = gfs2_meta_inode_buffer(ip, &bh); if (ret == 0) { - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); gfs2_dinode_out(ip, bh->b_data); brelse(bh); } @@ -888,13 +840,6 @@ static void gfs2_put_super(struct super_block *sb) int error; struct gfs2_jdesc *jd; - /* Unfreeze the filesystem, if we need to */ - - mutex_lock(&sdp->sd_freeze_lock); - if (sdp->sd_freeze_count) - gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); - mutex_unlock(&sdp->sd_freeze_lock); - /* No more recovery requests */ set_bit(SDF_NORECOVERY, &sdp->sd_flags); smp_mb(); @@ -985,7 +930,7 @@ static int gfs2_freeze(struct super_block *sb) return -EINVAL; for (;;) { - error = gfs2_freeze_fs(sdp); + error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh); if (!error) break; @@ -1013,7 +958,9 @@ static int gfs2_freeze(struct super_block *sb) static int gfs2_unfreeze(struct super_block *sb) { - gfs2_unfreeze_fs(sb->s_fs_info); + struct gfs2_sbd *sdp = sb->s_fs_info; + + gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); return 0; } @@ -1429,7 +1376,7 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip) if (error) return error; - error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (error) return error; @@ -1577,6 +1524,7 @@ out: /* Case 3 starts here */ truncate_inode_pages(&inode->i_data, 0); gfs2_rs_delete(ip); + gfs2_ordered_del_inode(ip); clear_inode(inode); gfs2_dir_hash_inval(ip); ip->i_gl->gl_object = NULL; diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index a0464680af0b..90e3322ffa10 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -46,9 +46,6 @@ extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, struct buffer_head *l_bh); extern int gfs2_statfs_sync(struct super_block *sb, int type); -extern int gfs2_freeze_fs(struct gfs2_sbd *sdp); -extern void gfs2_unfreeze_fs(struct gfs2_sbd *sdp); - extern struct file_system_type gfs2_fs_type; extern struct file_system_type gfs2meta_fs_type; extern const struct export_operations gfs2_export_ops; diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 8056b7b7238e..aa5c48044966 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -91,39 +91,37 @@ static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf) static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf) { - unsigned int count; - - mutex_lock(&sdp->sd_freeze_lock); - count = sdp->sd_freeze_count; - mutex_unlock(&sdp->sd_freeze_lock); + struct super_block *sb = sdp->sd_vfs; + int frozen = (sb->s_writers.frozen == SB_UNFROZEN) ? 0 : 1; - return snprintf(buf, PAGE_SIZE, "%u\n", count); + return snprintf(buf, PAGE_SIZE, "%u\n", frozen); } static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { - ssize_t ret = len; - int error = 0; + int error; int n = simple_strtol(buf, NULL, 0); if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + return -EPERM; switch (n) { case 0: - gfs2_unfreeze_fs(sdp); + error = thaw_super(sdp->sd_vfs); break; case 1: - error = gfs2_freeze_fs(sdp); + error = freeze_super(sdp->sd_vfs); break; default: - ret = -EINVAL; + return -EINVAL; } - if (error) + if (error) { fs_warn(sdp, "freeze %d error %d", n, error); + return error; + } - return ret; + return len; } static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf) @@ -135,7 +133,7 @@ static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf) static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + return -EPERM; if (simple_strtol(buf, NULL, 0) != 1) return -EINVAL; @@ -150,7 +148,7 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + return -EPERM; if (simple_strtol(buf, NULL, 0) != 1) return -EINVAL; @@ -163,7 +161,7 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + return -EPERM; if (simple_strtol(buf, NULL, 0) != 1) return -EINVAL; @@ -175,30 +173,40 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf, static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { + struct kqid qid; int error; u32 id; if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + return -EPERM; id = simple_strtoul(buf, NULL, 0); - error = gfs2_quota_refresh(sdp, 1, id); + qid = make_kqid(current_user_ns(), USRQUOTA, id); + if (!qid_valid(qid)) + return -EINVAL; + + error = gfs2_quota_refresh(sdp, qid); return error ? error : len; } static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { + struct kqid qid; int error; u32 id; if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + return -EPERM; id = simple_strtoul(buf, NULL, 0); - error = gfs2_quota_refresh(sdp, 0, id); + qid = make_kqid(current_user_ns(), GRPQUOTA, id); + if (!qid_valid(qid)) + return -EINVAL; + + error = gfs2_quota_refresh(sdp, qid); return error ? error : len; } @@ -213,7 +221,7 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len int rv; if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + return -EPERM; rv = sscanf(buf, "%u:%llu %15s", &gltype, &glnum, mode); @@ -332,6 +340,28 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len) return ret; } +static ssize_t wdack_show(struct gfs2_sbd *sdp, char *buf) +{ + int val = completion_done(&sdp->sd_wdack) ? 1 : 0; + + return sprintf(buf, "%d\n", val); +} + +static ssize_t wdack_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + ssize_t ret = len; + int val; + + val = simple_strtol(buf, NULL, 0); + + if ((val == 1) && + !strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm")) + complete(&sdp->sd_wdack); + else + ret = -EINVAL; + return ret; +} + static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; @@ -463,7 +493,7 @@ static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); GDLM_ATTR(block, 0644, block_show, block_store); -GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); +GDLM_ATTR(withdraw, 0644, wdack_show, wdack_store); GDLM_ATTR(jid, 0644, jid_show, jid_store); GDLM_ATTR(first, 0644, lkfirst_show, lkfirst_store); GDLM_ATTR(first_done, 0444, first_done_show, NULL); @@ -502,7 +532,7 @@ static ssize_t quota_scale_store(struct gfs2_sbd *sdp, const char *buf, unsigned int x, y; if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + return -EPERM; if (sscanf(buf, "%u %u", &x, &y) != 2 || !y) return -EINVAL; @@ -521,7 +551,7 @@ static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field, unsigned int x; if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + return -EPERM; x = simple_strtoul(buf, NULL, 0); diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 413627072f36..88162fae27a5 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -18,6 +18,7 @@ #include "gfs2.h" #include "incore.h" #include "glock.h" +#include "inode.h" #include "log.h" #include "lops.h" #include "meta_io.h" @@ -142,44 +143,143 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) sb_end_intwrite(sdp->sd_vfs); } +static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl, + struct buffer_head *bh, + const struct gfs2_log_operations *lops) +{ + struct gfs2_bufdata *bd; + + bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL); + bd->bd_bh = bh; + bd->bd_gl = gl; + bd->bd_ops = lops; + INIT_LIST_HEAD(&bd->bd_list); + bh->b_private = bd; + return bd; +} + /** - * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction - * @gl: the glock the buffer belongs to + * gfs2_trans_add_data - Add a databuf to the transaction. + * @gl: The inode glock associated with the buffer * @bh: The buffer to add - * @meta: True in the case of adding metadata * + * This is used in two distinct cases: + * i) In ordered write mode + * We put the data buffer on a list so that we can ensure that its + * synced to disk at the right time + * ii) In journaled data mode + * We need to journal the data block in the same way as metadata in + * the functions above. The difference is that here we have a tag + * which is two __be64's being the block number (as per meta data) + * and a flag which says whether the data block needs escaping or + * not. This means we need a new log entry for each 251 or so data + * blocks, which isn't an enormous overhead but twice as much as + * for normal metadata blocks. */ +void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh) +{ + struct gfs2_trans *tr = current->journal_info; + struct gfs2_sbd *sdp = gl->gl_sbd; + struct address_space *mapping = bh->b_page->mapping; + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_bufdata *bd; -void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta) + if (!gfs2_is_jdata(ip)) { + gfs2_ordered_add_inode(ip); + return; + } + + lock_buffer(bh); + gfs2_log_lock(sdp); + bd = bh->b_private; + if (bd == NULL) { + gfs2_log_unlock(sdp); + unlock_buffer(bh); + if (bh->b_private == NULL) + bd = gfs2_alloc_bufdata(gl, bh, &gfs2_databuf_lops); + lock_buffer(bh); + gfs2_log_lock(sdp); + } + gfs2_assert(sdp, bd->bd_gl == gl); + tr->tr_touched = 1; + if (list_empty(&bd->bd_list)) { + set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); + set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); + gfs2_pin(sdp, bd->bd_bh); + tr->tr_num_databuf_new++; + sdp->sd_log_num_databuf++; + list_add_tail(&bd->bd_list, &sdp->sd_log_le_databuf); + } + gfs2_log_unlock(sdp); + unlock_buffer(bh); +} + +static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) { + struct gfs2_meta_header *mh; + struct gfs2_trans *tr; + + tr = current->journal_info; + tr->tr_touched = 1; + if (!list_empty(&bd->bd_list)) + return; + set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); + set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); + mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; + if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) { + printk(KERN_ERR + "Attempting to add uninitialised block to journal (inplace block=%lld)\n", + (unsigned long long)bd->bd_bh->b_blocknr); + BUG(); + } + gfs2_pin(sdp, bd->bd_bh); + mh->__pad0 = cpu_to_be64(0); + mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); + sdp->sd_log_num_buf++; + list_add(&bd->bd_list, &sdp->sd_log_le_buf); + tr->tr_num_buf_new++; +} + +void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; struct gfs2_bufdata *bd; lock_buffer(bh); gfs2_log_lock(sdp); bd = bh->b_private; - if (bd) - gfs2_assert(sdp, bd->bd_gl == gl); - else { + if (bd == NULL) { gfs2_log_unlock(sdp); unlock_buffer(bh); - gfs2_attach_bufdata(gl, bh, meta); - bd = bh->b_private; + lock_page(bh->b_page); + if (bh->b_private == NULL) + bd = gfs2_alloc_bufdata(gl, bh, &gfs2_buf_lops); + unlock_page(bh->b_page); lock_buffer(bh); gfs2_log_lock(sdp); } - lops_add(sdp, bd); + gfs2_assert(sdp, bd->bd_gl == gl); + meta_lo_add(sdp, bd); gfs2_log_unlock(sdp); unlock_buffer(bh); } void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) { + struct gfs2_glock *gl = bd->bd_gl; + struct gfs2_trans *tr = current->journal_info; + BUG_ON(!list_empty(&bd->bd_list)); BUG_ON(!list_empty(&bd->bd_ail_st_list)); BUG_ON(!list_empty(&bd->bd_ail_gl_list)); - lops_init_le(bd, &gfs2_revoke_lops); - lops_add(sdp, bd); + bd->bd_ops = &gfs2_revoke_lops; + tr->tr_touched = 1; + tr->tr_num_revoke++; + sdp->sd_log_num_revoke++; + atomic_inc(&gl->gl_revokes); + set_bit(GLF_LFLUSH, &gl->gl_flags); + list_add(&bd->bd_list, &sdp->sd_log_le_revoke); } void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index bf2ae9aeee7a..1e6e7da25a17 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h @@ -39,7 +39,8 @@ extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, unsigned int revokes); extern void gfs2_trans_end(struct gfs2_sbd *sdp); -extern void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); +extern void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh); +extern void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh); extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); extern void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index f00d7c5744f6..6402fb69d71b 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -54,6 +54,9 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE); + if (!strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm")) + wait_for_completion(&sdp->sd_wdack); + if (lm->lm_unmount) { fs_err(sdp, "telling LM to unmount\n"); lm->lm_unmount(sdp); diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 76c144b3c9bb..ecd37f30ab91 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -270,7 +270,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, if (error) goto out_gunlock; - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); dataptrs = GFS2_EA2DATAPTRS(ea); for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) { @@ -309,7 +309,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); } @@ -331,7 +331,7 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, if (error) return error; - error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (error) goto out_alloc; @@ -509,7 +509,7 @@ static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, } if (din) { - gfs2_trans_add_bh(ip->i_gl, bh[x], 1); + gfs2_trans_add_meta(ip->i_gl, bh[x]); memcpy(pos, din, cp_size); din += sdp->sd_jbsize; } @@ -629,7 +629,7 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp) return error; gfs2_trans_add_unrevoke(sdp, block, 1); *bhp = gfs2_meta_new(ip->i_gl, block); - gfs2_trans_add_bh(ip->i_gl, *bhp, 1); + gfs2_trans_add_meta(ip->i_gl, *bhp); gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA); gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header)); @@ -691,7 +691,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea, return error; gfs2_trans_add_unrevoke(sdp, block, 1); bh = gfs2_meta_new(ip->i_gl, block); - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED); gfs2_add_inode_blocks(&ip->i_inode, 1); @@ -751,7 +751,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); } @@ -834,7 +834,7 @@ static void ea_set_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_header *prev = el->el_prev; u32 len; - gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1); + gfs2_trans_add_meta(ip->i_gl, el->el_bh); if (!prev || !GFS2_EA_IS_STUFFED(ea)) { ea->ea_type = GFS2_EATYPE_UNUSED; @@ -872,7 +872,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, if (error) return error; - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); if (es->ea_split) ea = ea_split_ea(ea); @@ -886,7 +886,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, if (error) goto out; ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); out: @@ -901,7 +901,7 @@ static int ea_set_simple_alloc(struct gfs2_inode *ip, struct gfs2_ea_header *ea = es->es_ea; int error; - gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1); + gfs2_trans_add_meta(ip->i_gl, es->es_bh); if (es->ea_split) ea = ea_split_ea(ea); @@ -997,7 +997,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, goto out; } - gfs2_trans_add_bh(ip->i_gl, indbh, 1); + gfs2_trans_add_meta(ip->i_gl, indbh); } else { u64 blk; unsigned int n = 1; @@ -1006,7 +1006,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, return error; gfs2_trans_add_unrevoke(sdp, blk, 1); indbh = gfs2_meta_new(ip->i_gl, blk); - gfs2_trans_add_bh(ip->i_gl, indbh, 1); + gfs2_trans_add_meta(ip->i_gl, indbh); gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN); gfs2_buffer_clear_tail(indbh, mh_size); @@ -1092,7 +1092,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) if (error) return error; - gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1); + gfs2_trans_add_meta(ip->i_gl, el->el_bh); if (prev) { u32 len; @@ -1109,7 +1109,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); } @@ -1265,7 +1265,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) if (GFS2_EA_IS_STUFFED(el.el_ea)) { error = gfs2_trans_begin(sdp, RES_DINODE + RES_EATTR, 0); if (error == 0) { - gfs2_trans_add_bh(ip->i_gl, el.el_bh, 1); + gfs2_trans_add_meta(ip->i_gl, el.el_bh); memcpy(GFS2_EA2DATA(el.el_ea), data, GFS2_EA_DATA_LEN(el.el_ea)); } @@ -1352,7 +1352,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) if (error) goto out_gunlock; - gfs2_trans_add_bh(ip->i_gl, indbh, 1); + gfs2_trans_add_meta(ip->i_gl, indbh); eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header)); bstart = 0; @@ -1384,7 +1384,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); } @@ -1434,7 +1434,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip) error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); } @@ -1461,7 +1461,7 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip) if (error) return error; - error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (error) return error; diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig index b77c5bc20f8a..998e3a6decf3 100644 --- a/fs/hfs/Kconfig +++ b/fs/hfs/Kconfig @@ -1,6 +1,6 @@ config HFS_FS - tristate "Apple Macintosh file system support (EXPERIMENTAL)" - depends on BLOCK && EXPERIMENTAL + tristate "Apple Macintosh file system support" + depends on BLOCK select NLS help If you say Y here, you will be able to mount Macintosh-formatted diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 422dde2ec0a1..5f7f1abd5f6d 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -51,7 +51,7 @@ done: */ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; int len, err; char strbuf[HFS_MAX_NAMELEN]; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index d47f11658c17..3031dfdd2358 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -128,7 +128,7 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb, { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; - struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; + struct inode *inode = file_inode(file)->i_mapping->host; ssize_t ret; ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, diff --git a/fs/hfs/super.c b/fs/hfs/super.c index e93ddaadfd1e..bbaaa8a4ee64 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -466,6 +466,7 @@ static struct file_system_type hfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("hfs"); static void hfs_init_once(void *p) { diff --git a/fs/hfsplus/Makefile b/fs/hfsplus/Makefile index 3cc0df730156..09d278bb7b91 100644 --- a/fs/hfsplus/Makefile +++ b/fs/hfsplus/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_HFSPLUS_FS) += hfsplus.o hfsplus-objs := super.o options.o inode.o ioctl.o extents.o catalog.o dir.o btree.o \ - bnode.o brec.o bfind.o tables.o unicode.o wrapper.o bitmap.o part_tbl.o - + bnode.o brec.o bfind.o tables.o unicode.o wrapper.o bitmap.o part_tbl.o \ + attributes.o xattr.o xattr_user.o xattr_security.o xattr_trusted.o diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c new file mode 100644 index 000000000000..8d691f124714 --- /dev/null +++ b/fs/hfsplus/attributes.c @@ -0,0 +1,399 @@ +/* + * linux/fs/hfsplus/attributes.c + * + * Vyacheslav Dubeyko <slava@dubeyko.com> + * + * Handling of records in attributes tree + */ + +#include "hfsplus_fs.h" +#include "hfsplus_raw.h" + +static struct kmem_cache *hfsplus_attr_tree_cachep; + +int hfsplus_create_attr_tree_cache(void) +{ + if (hfsplus_attr_tree_cachep) + return -EEXIST; + + hfsplus_attr_tree_cachep = + kmem_cache_create("hfsplus_attr_cache", + sizeof(hfsplus_attr_entry), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!hfsplus_attr_tree_cachep) + return -ENOMEM; + + return 0; +} + +void hfsplus_destroy_attr_tree_cache(void) +{ + kmem_cache_destroy(hfsplus_attr_tree_cachep); +} + +int hfsplus_attr_bin_cmp_key(const hfsplus_btree_key *k1, + const hfsplus_btree_key *k2) +{ + __be32 k1_cnid, k2_cnid; + + k1_cnid = k1->attr.cnid; + k2_cnid = k2->attr.cnid; + if (k1_cnid != k2_cnid) + return be32_to_cpu(k1_cnid) < be32_to_cpu(k2_cnid) ? -1 : 1; + + return hfsplus_strcmp( + (const struct hfsplus_unistr *)&k1->attr.key_name, + (const struct hfsplus_unistr *)&k2->attr.key_name); +} + +int hfsplus_attr_build_key(struct super_block *sb, hfsplus_btree_key *key, + u32 cnid, const char *name) +{ + int len; + + memset(key, 0, sizeof(struct hfsplus_attr_key)); + key->attr.cnid = cpu_to_be32(cnid); + if (name) { + len = strlen(name); + if (len > HFSPLUS_ATTR_MAX_STRLEN) { + printk(KERN_ERR "hfs: invalid xattr name's length\n"); + return -EINVAL; + } + hfsplus_asc2uni(sb, + (struct hfsplus_unistr *)&key->attr.key_name, + HFSPLUS_ATTR_MAX_STRLEN, name, len); + len = be16_to_cpu(key->attr.key_name.length); + } else { + key->attr.key_name.length = 0; + len = 0; + } + + /* The length of the key, as stored in key_len field, does not include + * the size of the key_len field itself. + * So, offsetof(hfsplus_attr_key, key_name) is a trick because + * it takes into consideration key_len field (__be16) of + * hfsplus_attr_key structure instead of length field (__be16) of + * hfsplus_attr_unistr structure. + */ + key->key_len = + cpu_to_be16(offsetof(struct hfsplus_attr_key, key_name) + + 2 * len); + + return 0; +} + +void hfsplus_attr_build_key_uni(hfsplus_btree_key *key, + u32 cnid, + struct hfsplus_attr_unistr *name) +{ + int ustrlen; + + memset(key, 0, sizeof(struct hfsplus_attr_key)); + ustrlen = be16_to_cpu(name->length); + key->attr.cnid = cpu_to_be32(cnid); + key->attr.key_name.length = cpu_to_be16(ustrlen); + ustrlen *= 2; + memcpy(key->attr.key_name.unicode, name->unicode, ustrlen); + + /* The length of the key, as stored in key_len field, does not include + * the size of the key_len field itself. + * So, offsetof(hfsplus_attr_key, key_name) is a trick because + * it takes into consideration key_len field (__be16) of + * hfsplus_attr_key structure instead of length field (__be16) of + * hfsplus_attr_unistr structure. + */ + key->key_len = + cpu_to_be16(offsetof(struct hfsplus_attr_key, key_name) + + ustrlen); +} + +hfsplus_attr_entry *hfsplus_alloc_attr_entry(void) +{ + return kmem_cache_alloc(hfsplus_attr_tree_cachep, GFP_KERNEL); +} + +void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry) +{ + if (entry) + kmem_cache_free(hfsplus_attr_tree_cachep, entry); +} + +#define HFSPLUS_INVALID_ATTR_RECORD -1 + +static int hfsplus_attr_build_record(hfsplus_attr_entry *entry, int record_type, + u32 cnid, const void *value, size_t size) +{ + if (record_type == HFSPLUS_ATTR_FORK_DATA) { + /* + * Mac OS X supports only inline data attributes. + * Do nothing + */ + memset(entry, 0, sizeof(*entry)); + return sizeof(struct hfsplus_attr_fork_data); + } else if (record_type == HFSPLUS_ATTR_EXTENTS) { + /* + * Mac OS X supports only inline data attributes. + * Do nothing. + */ + memset(entry, 0, sizeof(*entry)); + return sizeof(struct hfsplus_attr_extents); + } else if (record_type == HFSPLUS_ATTR_INLINE_DATA) { + u16 len; + + memset(entry, 0, sizeof(struct hfsplus_attr_inline_data)); + entry->inline_data.record_type = cpu_to_be32(record_type); + if (size <= HFSPLUS_MAX_INLINE_DATA_SIZE) + len = size; + else + return HFSPLUS_INVALID_ATTR_RECORD; + entry->inline_data.length = cpu_to_be16(len); + memcpy(entry->inline_data.raw_bytes, value, len); + /* + * Align len on two-byte boundary. + * It needs to add pad byte if we have odd len. + */ + len = round_up(len, 2); + return offsetof(struct hfsplus_attr_inline_data, raw_bytes) + + len; + } else /* invalid input */ + memset(entry, 0, sizeof(*entry)); + + return HFSPLUS_INVALID_ATTR_RECORD; +} + +int hfsplus_find_attr(struct super_block *sb, u32 cnid, + const char *name, struct hfs_find_data *fd) +{ + int err = 0; + + dprint(DBG_ATTR_MOD, "find_attr: %s,%d\n", name ? name : NULL, cnid); + + if (!HFSPLUS_SB(sb)->attr_tree) { + printk(KERN_ERR "hfs: attributes file doesn't exist\n"); + return -EINVAL; + } + + if (name) { + err = hfsplus_attr_build_key(sb, fd->search_key, cnid, name); + if (err) + goto failed_find_attr; + err = hfs_brec_find(fd, hfs_find_rec_by_key); + if (err) + goto failed_find_attr; + } else { + err = hfsplus_attr_build_key(sb, fd->search_key, cnid, NULL); + if (err) + goto failed_find_attr; + err = hfs_brec_find(fd, hfs_find_1st_rec_by_cnid); + if (err) + goto failed_find_attr; + } + +failed_find_attr: + return err; +} + +int hfsplus_attr_exists(struct inode *inode, const char *name) +{ + int err = 0; + struct super_block *sb = inode->i_sb; + struct hfs_find_data fd; + + if (!HFSPLUS_SB(sb)->attr_tree) + return 0; + + err = hfs_find_init(HFSPLUS_SB(sb)->attr_tree, &fd); + if (err) + return 0; + + err = hfsplus_find_attr(sb, inode->i_ino, name, &fd); + if (err) + goto attr_not_found; + + hfs_find_exit(&fd); + return 1; + +attr_not_found: + hfs_find_exit(&fd); + return 0; +} + +int hfsplus_create_attr(struct inode *inode, + const char *name, + const void *value, size_t size) +{ + struct super_block *sb = inode->i_sb; + struct hfs_find_data fd; + hfsplus_attr_entry *entry_ptr; + int entry_size; + int err; + + dprint(DBG_ATTR_MOD, "create_attr: %s,%ld\n", + name ? name : NULL, inode->i_ino); + + if (!HFSPLUS_SB(sb)->attr_tree) { + printk(KERN_ERR "hfs: attributes file doesn't exist\n"); + return -EINVAL; + } + + entry_ptr = hfsplus_alloc_attr_entry(); + if (!entry_ptr) + return -ENOMEM; + + err = hfs_find_init(HFSPLUS_SB(sb)->attr_tree, &fd); + if (err) + goto failed_init_create_attr; + + if (name) { + err = hfsplus_attr_build_key(sb, fd.search_key, + inode->i_ino, name); + if (err) + goto failed_create_attr; + } else { + err = -EINVAL; + goto failed_create_attr; + } + + /* Mac OS X supports only inline data attributes. */ + entry_size = hfsplus_attr_build_record(entry_ptr, + HFSPLUS_ATTR_INLINE_DATA, + inode->i_ino, + value, size); + if (entry_size == HFSPLUS_INVALID_ATTR_RECORD) { + err = -EINVAL; + goto failed_create_attr; + } + + err = hfs_brec_find(&fd, hfs_find_rec_by_key); + if (err != -ENOENT) { + if (!err) + err = -EEXIST; + goto failed_create_attr; + } + + err = hfs_brec_insert(&fd, entry_ptr, entry_size); + if (err) + goto failed_create_attr; + + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ATTR_DIRTY); + +failed_create_attr: + hfs_find_exit(&fd); + +failed_init_create_attr: + hfsplus_destroy_attr_entry(entry_ptr); + return err; +} + +static int __hfsplus_delete_attr(struct inode *inode, u32 cnid, + struct hfs_find_data *fd) +{ + int err = 0; + __be32 found_cnid, record_type; + + hfs_bnode_read(fd->bnode, &found_cnid, + fd->keyoffset + + offsetof(struct hfsplus_attr_key, cnid), + sizeof(__be32)); + if (cnid != be32_to_cpu(found_cnid)) + return -ENOENT; + + hfs_bnode_read(fd->bnode, &record_type, + fd->entryoffset, sizeof(record_type)); + + switch (be32_to_cpu(record_type)) { + case HFSPLUS_ATTR_INLINE_DATA: + /* All is OK. Do nothing. */ + break; + case HFSPLUS_ATTR_FORK_DATA: + case HFSPLUS_ATTR_EXTENTS: + printk(KERN_ERR "hfs: only inline data xattr are supported\n"); + return -EOPNOTSUPP; + default: + printk(KERN_ERR "hfs: invalid extended attribute record\n"); + return -ENOENT; + } + + err = hfs_brec_remove(fd); + if (err) + return err; + + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ATTR_DIRTY); + return err; +} + +int hfsplus_delete_attr(struct inode *inode, const char *name) +{ + int err = 0; + struct super_block *sb = inode->i_sb; + struct hfs_find_data fd; + + dprint(DBG_ATTR_MOD, "delete_attr: %s,%ld\n", + name ? name : NULL, inode->i_ino); + + if (!HFSPLUS_SB(sb)->attr_tree) { + printk(KERN_ERR "hfs: attributes file doesn't exist\n"); + return -EINVAL; + } + + err = hfs_find_init(HFSPLUS_SB(sb)->attr_tree, &fd); + if (err) + return err; + + if (name) { + err = hfsplus_attr_build_key(sb, fd.search_key, + inode->i_ino, name); + if (err) + goto out; + } else { + printk(KERN_ERR "hfs: invalid extended attribute name\n"); + err = -EINVAL; + goto out; + } + + err = hfs_brec_find(&fd, hfs_find_rec_by_key); + if (err) + goto out; + + err = __hfsplus_delete_attr(inode, inode->i_ino, &fd); + if (err) + goto out; + +out: + hfs_find_exit(&fd); + return err; +} + +int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid) +{ + int err = 0; + struct hfs_find_data fd; + + dprint(DBG_ATTR_MOD, "delete_all_attrs: %d\n", cnid); + + if (!HFSPLUS_SB(dir->i_sb)->attr_tree) { + printk(KERN_ERR "hfs: attributes file doesn't exist\n"); + return -EINVAL; + } + + err = hfs_find_init(HFSPLUS_SB(dir->i_sb)->attr_tree, &fd); + if (err) + return err; + + for (;;) { + err = hfsplus_find_attr(dir->i_sb, cnid, NULL, &fd); + if (err) { + if (err != -ENOENT) + printk(KERN_ERR "hfs: xattr search failed.\n"); + goto end_delete_all; + } + + err = __hfsplus_delete_attr(dir, cnid, &fd); + if (err) + goto end_delete_all; + } + +end_delete_all: + hfs_find_exit(&fd); + return err; +} diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c index 5d799c13205f..d73c98d1ee99 100644 --- a/fs/hfsplus/bfind.c +++ b/fs/hfsplus/bfind.c @@ -24,7 +24,19 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) fd->key = ptr + tree->max_key_len + 2; dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); - mutex_lock(&tree->tree_lock); + switch (tree->cnid) { + case HFSPLUS_CAT_CNID: + mutex_lock_nested(&tree->tree_lock, CATALOG_BTREE_MUTEX); + break; + case HFSPLUS_EXT_CNID: + mutex_lock_nested(&tree->tree_lock, EXTENTS_BTREE_MUTEX); + break; + case HFSPLUS_ATTR_CNID: + mutex_lock_nested(&tree->tree_lock, ATTR_BTREE_MUTEX); + break; + default: + BUG(); + } return 0; } @@ -38,15 +50,73 @@ void hfs_find_exit(struct hfs_find_data *fd) fd->tree = NULL; } -/* Find the record in bnode that best matches key (not greater than...)*/ -int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd) +int hfs_find_1st_rec_by_cnid(struct hfs_bnode *bnode, + struct hfs_find_data *fd, + int *begin, + int *end, + int *cur_rec) +{ + __be32 cur_cnid, search_cnid; + + if (bnode->tree->cnid == HFSPLUS_EXT_CNID) { + cur_cnid = fd->key->ext.cnid; + search_cnid = fd->search_key->ext.cnid; + } else if (bnode->tree->cnid == HFSPLUS_CAT_CNID) { + cur_cnid = fd->key->cat.parent; + search_cnid = fd->search_key->cat.parent; + } else if (bnode->tree->cnid == HFSPLUS_ATTR_CNID) { + cur_cnid = fd->key->attr.cnid; + search_cnid = fd->search_key->attr.cnid; + } else + BUG(); + + if (cur_cnid == search_cnid) { + (*end) = (*cur_rec); + if ((*begin) == (*end)) + return 1; + } else { + if (be32_to_cpu(cur_cnid) < be32_to_cpu(search_cnid)) + (*begin) = (*cur_rec) + 1; + else + (*end) = (*cur_rec) - 1; + } + + return 0; +} + +int hfs_find_rec_by_key(struct hfs_bnode *bnode, + struct hfs_find_data *fd, + int *begin, + int *end, + int *cur_rec) { int cmpval; + + cmpval = bnode->tree->keycmp(fd->key, fd->search_key); + if (!cmpval) { + (*end) = (*cur_rec); + return 1; + } + if (cmpval < 0) + (*begin) = (*cur_rec) + 1; + else + *(end) = (*cur_rec) - 1; + + return 0; +} + +/* Find the record in bnode that best matches key (not greater than...)*/ +int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd, + search_strategy_t rec_found) +{ u16 off, len, keylen; int rec; int b, e; int res; + if (!rec_found) + BUG(); + b = 0; e = bnode->num_recs - 1; res = -ENOENT; @@ -59,17 +129,12 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd) goto fail; } hfs_bnode_read(bnode, fd->key, off, keylen); - cmpval = bnode->tree->keycmp(fd->key, fd->search_key); - if (!cmpval) { - e = rec; + if (rec_found(bnode, fd, &b, &e, &rec)) { res = 0; goto done; } - if (cmpval < 0) - b = rec + 1; - else - e = rec - 1; } while (b <= e); + if (rec != e && e >= 0) { len = hfs_brec_lenoff(bnode, e, &off); keylen = hfs_brec_keylen(bnode, e); @@ -79,19 +144,21 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd) } hfs_bnode_read(bnode, fd->key, off, keylen); } + done: fd->record = e; fd->keyoffset = off; fd->keylength = keylen; fd->entryoffset = off + keylen; fd->entrylength = len - keylen; + fail: return res; } /* Traverse a B*Tree from the root to a leaf finding best fit to key */ /* Return allocated copy of node found, set recnum to best record */ -int hfs_brec_find(struct hfs_find_data *fd) +int hfs_brec_find(struct hfs_find_data *fd, search_strategy_t do_key_compare) { struct hfs_btree *tree; struct hfs_bnode *bnode; @@ -122,7 +189,7 @@ int hfs_brec_find(struct hfs_find_data *fd) goto invalid; bnode->parent = parent; - res = __hfs_brec_find(bnode, fd); + res = __hfs_brec_find(bnode, fd, do_key_compare); if (!height) break; if (fd->record < 0) @@ -149,7 +216,7 @@ int hfs_brec_read(struct hfs_find_data *fd, void *rec, int rec_len) { int res; - res = hfs_brec_find(fd); + res = hfs_brec_find(fd, hfs_find_rec_by_key); if (res) return res; if (fd->entrylength > rec_len) diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 1c42cc5b899f..f31ac6f404f1 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -62,7 +62,8 @@ void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off) tree = node->tree; if (node->type == HFS_NODE_LEAF || - tree->attributes & HFS_TREE_VARIDXKEYS) + tree->attributes & HFS_TREE_VARIDXKEYS || + node->tree->cnid == HFSPLUS_ATTR_CNID) key_len = hfs_bnode_read_u16(node, off) + 2; else key_len = tree->max_key_len + 2; @@ -314,7 +315,8 @@ void hfs_bnode_dump(struct hfs_bnode *node) if (i && node->type == HFS_NODE_INDEX) { int tmp; - if (node->tree->attributes & HFS_TREE_VARIDXKEYS) + if (node->tree->attributes & HFS_TREE_VARIDXKEYS || + node->tree->cnid == HFSPLUS_ATTR_CNID) tmp = hfs_bnode_read_u16(node, key_off) + 2; else tmp = node->tree->max_key_len + 2; @@ -646,6 +648,8 @@ void hfs_bnode_put(struct hfs_bnode *node) if (test_bit(HFS_BNODE_DELETED, &node->flags)) { hfs_bnode_unhash(node); spin_unlock(&tree->hash_lock); + hfs_bnode_clear(node, 0, + PAGE_CACHE_SIZE * tree->pages_per_bnode); hfs_bmap_free(node); hfs_bnode_free(node); return; diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c index 2a734cfccc92..298d4e45604b 100644 --- a/fs/hfsplus/brec.c +++ b/fs/hfsplus/brec.c @@ -36,7 +36,8 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec) return 0; if ((node->type == HFS_NODE_INDEX) && - !(node->tree->attributes & HFS_TREE_VARIDXKEYS)) { + !(node->tree->attributes & HFS_TREE_VARIDXKEYS) && + (node->tree->cnid != HFSPLUS_ATTR_CNID)) { retval = node->tree->max_key_len + 2; } else { recoff = hfs_bnode_read_u16(node, @@ -151,12 +152,13 @@ skip: /* get index key */ hfs_bnode_read_key(new_node, fd->search_key, 14); - __hfs_brec_find(fd->bnode, fd); + __hfs_brec_find(fd->bnode, fd, hfs_find_rec_by_key); hfs_bnode_put(new_node); new_node = NULL; - if (tree->attributes & HFS_TREE_VARIDXKEYS) + if ((tree->attributes & HFS_TREE_VARIDXKEYS) || + (tree->cnid == HFSPLUS_ATTR_CNID)) key_len = be16_to_cpu(fd->search_key->key_len) + 2; else { fd->search_key->key_len = @@ -201,7 +203,7 @@ again: hfs_bnode_put(node); node = fd->bnode = parent; - __hfs_brec_find(node, fd); + __hfs_brec_find(node, fd, hfs_find_rec_by_key); goto again; } hfs_bnode_write_u16(node, @@ -367,12 +369,13 @@ again: parent = hfs_bnode_find(tree, node->parent); if (IS_ERR(parent)) return PTR_ERR(parent); - __hfs_brec_find(parent, fd); + __hfs_brec_find(parent, fd, hfs_find_rec_by_key); hfs_bnode_dump(parent); rec = fd->record; /* size difference between old and new key */ - if (tree->attributes & HFS_TREE_VARIDXKEYS) + if ((tree->attributes & HFS_TREE_VARIDXKEYS) || + (tree->cnid == HFSPLUS_ATTR_CNID)) newkeylen = hfs_bnode_read_u16(node, 14) + 2; else fd->keylength = newkeylen = tree->max_key_len + 2; @@ -427,7 +430,7 @@ skip: hfs_bnode_read_key(new_node, fd->search_key, 14); cnid = cpu_to_be32(new_node->this); - __hfs_brec_find(fd->bnode, fd); + __hfs_brec_find(fd->bnode, fd, hfs_find_rec_by_key); hfs_brec_insert(fd, &cnid, sizeof(cnid)); hfs_bnode_put(fd->bnode); hfs_bnode_put(new_node); @@ -495,13 +498,15 @@ static int hfs_btree_inc_height(struct hfs_btree *tree) /* insert old root idx into new root */ node->parent = tree->root; if (node->type == HFS_NODE_LEAF || - tree->attributes & HFS_TREE_VARIDXKEYS) + tree->attributes & HFS_TREE_VARIDXKEYS || + tree->cnid == HFSPLUS_ATTR_CNID) key_size = hfs_bnode_read_u16(node, 14) + 2; else key_size = tree->max_key_len + 2; hfs_bnode_copy(new_node, 14, node, 14, key_size); - if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) { + if (!(tree->attributes & HFS_TREE_VARIDXKEYS) && + (tree->cnid != HFSPLUS_ATTR_CNID)) { key_size = tree->max_key_len + 2; hfs_bnode_write_u16(new_node, 14, tree->max_key_len); } diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index 685d07d0ed18..efb689c21a95 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -98,6 +98,14 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) set_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); } break; + case HFSPLUS_ATTR_CNID: + if (tree->max_key_len != HFSPLUS_ATTR_KEYLEN - sizeof(u16)) { + printk(KERN_ERR "hfs: invalid attributes max_key_len %d\n", + tree->max_key_len); + goto fail_page; + } + tree->keycmp = hfsplus_attr_bin_cmp_key; + break; default: printk(KERN_ERR "hfs: unknown B*Tree requested\n"); goto fail_page; diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index 798d9c4c5e71..840d71edd193 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -45,7 +45,8 @@ void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key, key->cat.parent = cpu_to_be32(parent); if (str) { - hfsplus_asc2uni(sb, &key->cat.name, str->name, str->len); + hfsplus_asc2uni(sb, &key->cat.name, HFSPLUS_MAX_STRLEN, + str->name, str->len); len = be16_to_cpu(key->cat.name.length); } else { key->cat.name.length = 0; @@ -167,7 +168,8 @@ static int hfsplus_fill_cat_thread(struct super_block *sb, entry->type = cpu_to_be16(type); entry->thread.reserved = 0; entry->thread.parentID = cpu_to_be32(parentid); - hfsplus_asc2uni(sb, &entry->thread.nodeName, str->name, str->len); + hfsplus_asc2uni(sb, &entry->thread.nodeName, HFSPLUS_MAX_STRLEN, + str->name, str->len); return 10 + be16_to_cpu(entry->thread.nodeName.length) * 2; } @@ -198,7 +200,7 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid, hfsplus_cat_build_key_uni(fd->search_key, be32_to_cpu(tmp.thread.parentID), &tmp.thread.nodeName); - return hfs_brec_find(fd); + return hfs_brec_find(fd, hfs_find_rec_by_key); } int hfsplus_create_cat(u32 cnid, struct inode *dir, @@ -221,7 +223,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, S_ISDIR(inode->i_mode) ? HFSPLUS_FOLDER_THREAD : HFSPLUS_FILE_THREAD, dir->i_ino, str); - err = hfs_brec_find(&fd); + err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err != -ENOENT) { if (!err) err = -EEXIST; @@ -233,7 +235,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str); entry_size = hfsplus_cat_build_record(&entry, cnid, inode); - err = hfs_brec_find(&fd); + err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err != -ENOENT) { /* panic? */ if (!err) @@ -253,7 +255,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, err1: hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); - if (!hfs_brec_find(&fd)) + if (!hfs_brec_find(&fd, hfs_find_rec_by_key)) hfs_brec_remove(&fd); err2: hfs_find_exit(&fd); @@ -279,7 +281,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) int len; hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); - err = hfs_brec_find(&fd); + err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err) goto out; @@ -296,7 +298,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) } else hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str); - err = hfs_brec_find(&fd); + err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err) goto out; @@ -326,7 +328,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) goto out; hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); - err = hfs_brec_find(&fd); + err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err) goto out; @@ -337,6 +339,12 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) dir->i_size--; dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY); + + if (type == HFSPLUS_FILE || type == HFSPLUS_FOLDER) { + if (HFSPLUS_SB(sb)->attr_tree) + hfsplus_delete_all_attrs(dir, cnid); + } + out: hfs_find_exit(&fd); @@ -363,7 +371,7 @@ int hfsplus_rename_cat(u32 cnid, /* find the old dir entry and read the data */ hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); - err = hfs_brec_find(&src_fd); + err = hfs_brec_find(&src_fd, hfs_find_rec_by_key); if (err) goto out; if (src_fd.entrylength > sizeof(entry) || src_fd.entrylength < 0) { @@ -376,7 +384,7 @@ int hfsplus_rename_cat(u32 cnid, /* create new dir entry with the data from the old entry */ hfsplus_cat_build_key(sb, dst_fd.search_key, dst_dir->i_ino, dst_name); - err = hfs_brec_find(&dst_fd); + err = hfs_brec_find(&dst_fd, hfs_find_rec_by_key); if (err != -ENOENT) { if (!err) err = -EEXIST; @@ -391,7 +399,7 @@ int hfsplus_rename_cat(u32 cnid, /* finally remove the old entry */ hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); - err = hfs_brec_find(&src_fd); + err = hfs_brec_find(&src_fd, hfs_find_rec_by_key); if (err) goto out; err = hfs_brec_remove(&src_fd); @@ -402,7 +410,7 @@ int hfsplus_rename_cat(u32 cnid, /* remove old thread entry */ hfsplus_cat_build_key(sb, src_fd.search_key, cnid, NULL); - err = hfs_brec_find(&src_fd); + err = hfs_brec_find(&src_fd, hfs_find_rec_by_key); if (err) goto out; type = hfs_bnode_read_u16(src_fd.bnode, src_fd.entryoffset); @@ -414,7 +422,7 @@ int hfsplus_rename_cat(u32 cnid, hfsplus_cat_build_key(sb, dst_fd.search_key, cnid, NULL); entry_size = hfsplus_fill_cat_thread(sb, &entry, type, dst_dir->i_ino, dst_name); - err = hfs_brec_find(&dst_fd); + err = hfs_brec_find(&dst_fd, hfs_find_rec_by_key); if (err != -ENOENT) { if (!err) err = -EEXIST; diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 6b9f921ef2fa..031c24e50521 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -15,6 +15,7 @@ #include "hfsplus_fs.h" #include "hfsplus_raw.h" +#include "xattr.h" static inline void hfsplus_instantiate(struct dentry *dentry, struct inode *inode, u32 cnid) @@ -122,7 +123,7 @@ fail: static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; int len, err; char strbuf[HFSPLUS_MAX_STRLEN + 1]; @@ -138,7 +139,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) if (err) return err; hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); - err = hfs_brec_find(&fd); + err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err) goto out; @@ -421,6 +422,15 @@ static int hfsplus_symlink(struct inode *dir, struct dentry *dentry, if (res) goto out_err; + res = hfsplus_init_inode_security(inode, dir, &dentry->d_name); + if (res == -EOPNOTSUPP) + res = 0; /* Operation is not supported. */ + else if (res) { + /* Try to delete anyway without error analysis. */ + hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name); + goto out_err; + } + hfsplus_instantiate(dentry, inode, inode->i_ino); mark_inode_dirty(inode); goto out; @@ -450,15 +460,26 @@ static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, mode, rdev); res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); - if (res) { - clear_nlink(inode); - hfsplus_delete_inode(inode); - iput(inode); - goto out; + if (res) + goto failed_mknod; + + res = hfsplus_init_inode_security(inode, dir, &dentry->d_name); + if (res == -EOPNOTSUPP) + res = 0; /* Operation is not supported. */ + else if (res) { + /* Try to delete anyway without error analysis. */ + hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name); + goto failed_mknod; } hfsplus_instantiate(dentry, inode, inode->i_ino); mark_inode_dirty(inode); + goto out; + +failed_mknod: + clear_nlink(inode); + hfsplus_delete_inode(inode); + iput(inode); out: mutex_unlock(&sbi->vh_mutex); return res; @@ -499,15 +520,19 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, } const struct inode_operations hfsplus_dir_inode_operations = { - .lookup = hfsplus_lookup, - .create = hfsplus_create, - .link = hfsplus_link, - .unlink = hfsplus_unlink, - .mkdir = hfsplus_mkdir, - .rmdir = hfsplus_rmdir, - .symlink = hfsplus_symlink, - .mknod = hfsplus_mknod, - .rename = hfsplus_rename, + .lookup = hfsplus_lookup, + .create = hfsplus_create, + .link = hfsplus_link, + .unlink = hfsplus_unlink, + .mkdir = hfsplus_mkdir, + .rmdir = hfsplus_rmdir, + .symlink = hfsplus_symlink, + .mknod = hfsplus_mknod, + .rename = hfsplus_rename, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = hfsplus_listxattr, + .removexattr = hfsplus_removexattr, }; const struct file_operations hfsplus_dir_operations = { diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index eba76eab6d62..a94f0f779d5e 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -95,7 +95,7 @@ static void __hfsplus_ext_write_extent(struct inode *inode, HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); - res = hfs_brec_find(fd); + res = hfs_brec_find(fd, hfs_find_rec_by_key); if (hip->extent_state & HFSPLUS_EXT_NEW) { if (res != -ENOENT) return; @@ -154,7 +154,7 @@ static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd, hfsplus_ext_build_key(fd->search_key, cnid, block, type); fd->key->ext.cnid = 0; - res = hfs_brec_find(fd); + res = hfs_brec_find(fd, hfs_find_rec_by_key); if (res && res != -ENOENT) return res; if (fd->key->ext.cnid != fd->search_key->ext.cnid || diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index a6da86b1b4c1..05b11f36024c 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -23,6 +23,7 @@ #define DBG_SUPER 0x00000010 #define DBG_EXTENT 0x00000020 #define DBG_BITMAP 0x00000040 +#define DBG_ATTR_MOD 0x00000080 #if 0 #define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD) @@ -46,6 +47,13 @@ typedef int (*btree_keycmp)(const hfsplus_btree_key *, #define NODE_HASH_SIZE 256 +/* B-tree mutex nested subclasses */ +enum hfsplus_btree_mutex_classes { + CATALOG_BTREE_MUTEX, + EXTENTS_BTREE_MUTEX, + ATTR_BTREE_MUTEX, +}; + /* An HFS+ BTree held in memory */ struct hfs_btree { struct super_block *sb; @@ -223,6 +231,7 @@ struct hfsplus_inode_info { #define HFSPLUS_I_CAT_DIRTY 1 /* has changes in the catalog tree */ #define HFSPLUS_I_EXT_DIRTY 2 /* has changes in the extent tree */ #define HFSPLUS_I_ALLOC_DIRTY 3 /* has changes in the allocation file */ +#define HFSPLUS_I_ATTR_DIRTY 4 /* has changes in the attributes tree */ #define HFSPLUS_IS_RSRC(inode) \ test_bit(HFSPLUS_I_RSRC, &HFSPLUS_I(inode)->flags) @@ -302,7 +311,7 @@ static inline unsigned short hfsplus_min_io_size(struct super_block *sb) #define hfs_brec_remove hfsplus_brec_remove #define hfs_find_init hfsplus_find_init #define hfs_find_exit hfsplus_find_exit -#define __hfs_brec_find __hplusfs_brec_find +#define __hfs_brec_find __hfsplus_brec_find #define hfs_brec_find hfsplus_brec_find #define hfs_brec_read hfsplus_brec_read #define hfs_brec_goto hfsplus_brec_goto @@ -324,10 +333,33 @@ static inline unsigned short hfsplus_min_io_size(struct super_block *sb) */ #define HFSPLUS_IOC_BLESS _IO('h', 0x80) +typedef int (*search_strategy_t)(struct hfs_bnode *, + struct hfs_find_data *, + int *, int *, int *); + /* * Functions in any *.c used in other files */ +/* attributes.c */ +int hfsplus_create_attr_tree_cache(void); +void hfsplus_destroy_attr_tree_cache(void); +hfsplus_attr_entry *hfsplus_alloc_attr_entry(void); +void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry_p); +int hfsplus_attr_bin_cmp_key(const hfsplus_btree_key *, + const hfsplus_btree_key *); +int hfsplus_attr_build_key(struct super_block *, hfsplus_btree_key *, + u32, const char *); +void hfsplus_attr_build_key_uni(hfsplus_btree_key *key, + u32 cnid, + struct hfsplus_attr_unistr *name); +int hfsplus_find_attr(struct super_block *, u32, + const char *, struct hfs_find_data *); +int hfsplus_attr_exists(struct inode *inode, const char *name); +int hfsplus_create_attr(struct inode *, const char *, const void *, size_t); +int hfsplus_delete_attr(struct inode *, const char *); +int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid); + /* bitmap.c */ int hfsplus_block_allocate(struct super_block *, u32, u32, u32 *); int hfsplus_block_free(struct super_block *, u32, u32); @@ -369,8 +401,15 @@ int hfs_brec_remove(struct hfs_find_data *); /* bfind.c */ int hfs_find_init(struct hfs_btree *, struct hfs_find_data *); void hfs_find_exit(struct hfs_find_data *); -int __hfs_brec_find(struct hfs_bnode *, struct hfs_find_data *); -int hfs_brec_find(struct hfs_find_data *); +int hfs_find_1st_rec_by_cnid(struct hfs_bnode *, + struct hfs_find_data *, + int *, int *, int *); +int hfs_find_rec_by_key(struct hfs_bnode *, + struct hfs_find_data *, + int *, int *, int *); +int __hfs_brec_find(struct hfs_bnode *, struct hfs_find_data *, + search_strategy_t); +int hfs_brec_find(struct hfs_find_data *, search_strategy_t); int hfs_brec_read(struct hfs_find_data *, void *, int); int hfs_brec_goto(struct hfs_find_data *, int); @@ -417,11 +456,6 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, /* ioctl.c */ long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); -int hfsplus_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags); -ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, - void *value, size_t size); -ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size); /* options.c */ int hfsplus_parse_options(char *, struct hfsplus_sb_info *); @@ -446,7 +480,7 @@ int hfsplus_strcmp(const struct hfsplus_unistr *, int hfsplus_uni2asc(struct super_block *, const struct hfsplus_unistr *, char *, int *); int hfsplus_asc2uni(struct super_block *, - struct hfsplus_unistr *, const char *, int); + struct hfsplus_unistr *, int, const char *, int); int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode, struct qstr *str); int hfsplus_compare_dentry(const struct dentry *parent, diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h index 921967e5abb1..452ede01b036 100644 --- a/fs/hfsplus/hfsplus_raw.h +++ b/fs/hfsplus/hfsplus_raw.h @@ -52,13 +52,23 @@ typedef __be32 hfsplus_cnid; typedef __be16 hfsplus_unichr; +#define HFSPLUS_MAX_STRLEN 255 +#define HFSPLUS_ATTR_MAX_STRLEN 127 + /* A "string" as used in filenames, etc. */ struct hfsplus_unistr { __be16 length; - hfsplus_unichr unicode[255]; + hfsplus_unichr unicode[HFSPLUS_MAX_STRLEN]; } __packed; -#define HFSPLUS_MAX_STRLEN 255 +/* + * A "string" is used in attributes file + * for name of extended attribute + */ +struct hfsplus_attr_unistr { + __be16 length; + hfsplus_unichr unicode[HFSPLUS_ATTR_MAX_STRLEN]; +} __packed; /* POSIX permissions */ struct hfsplus_perm { @@ -291,6 +301,8 @@ struct hfsplus_cat_file { /* File attribute bits */ #define HFSPLUS_FILE_LOCKED 0x0001 #define HFSPLUS_FILE_THREAD_EXISTS 0x0002 +#define HFSPLUS_XATTR_EXISTS 0x0004 +#define HFSPLUS_ACL_EXISTS 0x0008 /* HFS+ catalog thread (part of a cat_entry) */ struct hfsplus_cat_thread { @@ -327,11 +339,63 @@ struct hfsplus_ext_key { #define HFSPLUS_EXT_KEYLEN sizeof(struct hfsplus_ext_key) +#define HFSPLUS_XATTR_FINDER_INFO_NAME "com.apple.FinderInfo" +#define HFSPLUS_XATTR_ACL_NAME "com.apple.system.Security" + +#define HFSPLUS_ATTR_INLINE_DATA 0x10 +#define HFSPLUS_ATTR_FORK_DATA 0x20 +#define HFSPLUS_ATTR_EXTENTS 0x30 + +/* HFS+ attributes tree key */ +struct hfsplus_attr_key { + __be16 key_len; + __be16 pad; + hfsplus_cnid cnid; + __be32 start_block; + struct hfsplus_attr_unistr key_name; +} __packed; + +#define HFSPLUS_ATTR_KEYLEN sizeof(struct hfsplus_attr_key) + +/* HFS+ fork data attribute */ +struct hfsplus_attr_fork_data { + __be32 record_type; + __be32 reserved; + struct hfsplus_fork_raw the_fork; +} __packed; + +/* HFS+ extension attribute */ +struct hfsplus_attr_extents { + __be32 record_type; + __be32 reserved; + struct hfsplus_extent extents; +} __packed; + +#define HFSPLUS_MAX_INLINE_DATA_SIZE 3802 + +/* HFS+ attribute inline data */ +struct hfsplus_attr_inline_data { + __be32 record_type; + __be32 reserved1; + u8 reserved2[6]; + __be16 length; + u8 raw_bytes[HFSPLUS_MAX_INLINE_DATA_SIZE]; +} __packed; + +/* A data record in the attributes tree */ +typedef union { + __be32 record_type; + struct hfsplus_attr_fork_data fork_data; + struct hfsplus_attr_extents extents; + struct hfsplus_attr_inline_data inline_data; +} __packed hfsplus_attr_entry; + /* HFS+ generic BTree key */ typedef union { __be16 key_len; struct hfsplus_cat_key cat; struct hfsplus_ext_key ext; + struct hfsplus_attr_key attr; } __packed hfsplus_btree_key; #endif diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 799b336b59f9..160ccc9cdb4b 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -17,6 +17,7 @@ #include "hfsplus_fs.h" #include "hfsplus_raw.h" +#include "xattr.h" static int hfsplus_readpage(struct file *file, struct page *page) { @@ -124,7 +125,7 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb, { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; - struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; + struct inode *inode = file_inode(file)->i_mapping->host; ssize_t ret; ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, @@ -348,6 +349,18 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, error = error2; } + if (test_and_clear_bit(HFSPLUS_I_ATTR_DIRTY, &hip->flags)) { + if (sbi->attr_tree) { + error2 = + filemap_write_and_wait( + sbi->attr_tree->inode->i_mapping); + if (!error) + error = error2; + } else { + printk(KERN_ERR "hfs: sync non-existent attributes tree\n"); + } + } + if (test_and_clear_bit(HFSPLUS_I_ALLOC_DIRTY, &hip->flags)) { error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping); if (!error) @@ -365,9 +378,10 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, static const struct inode_operations hfsplus_file_inode_operations = { .lookup = hfsplus_file_lookup, .setattr = hfsplus_setattr, - .setxattr = hfsplus_setxattr, - .getxattr = hfsplus_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = hfsplus_listxattr, + .removexattr = hfsplus_removexattr, }; static const struct file_operations hfsplus_file_operations = { diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index 09addc8615fa..d3ff5cc317d7 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -16,7 +16,6 @@ #include <linux/fs.h> #include <linux/mount.h> #include <linux/sched.h> -#include <linux/xattr.h> #include <asm/uaccess.h> #include "hfsplus_fs.h" @@ -59,7 +58,7 @@ static int hfsplus_ioctl_bless(struct file *file, int __user *user_flags) static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct hfsplus_inode_info *hip = HFSPLUS_I(inode); unsigned int flags = 0; @@ -75,7 +74,7 @@ static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags) static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct hfsplus_inode_info *hip = HFSPLUS_I(inode); unsigned int flags; int err = 0; @@ -151,110 +150,3 @@ long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return -ENOTTY; } } - -int hfsplus_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) -{ - struct inode *inode = dentry->d_inode; - struct hfs_find_data fd; - hfsplus_cat_entry entry; - struct hfsplus_cat_file *file; - int res; - - if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode)) - return -EOPNOTSUPP; - - res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); - if (res) - return res; - res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); - if (res) - goto out; - hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, - sizeof(struct hfsplus_cat_file)); - file = &entry.file; - - if (!strcmp(name, "hfs.type")) { - if (size == 4) - memcpy(&file->user_info.fdType, value, 4); - else - res = -ERANGE; - } else if (!strcmp(name, "hfs.creator")) { - if (size == 4) - memcpy(&file->user_info.fdCreator, value, 4); - else - res = -ERANGE; - } else - res = -EOPNOTSUPP; - if (!res) { - hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, - sizeof(struct hfsplus_cat_file)); - hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); - } -out: - hfs_find_exit(&fd); - return res; -} - -ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, - void *value, size_t size) -{ - struct inode *inode = dentry->d_inode; - struct hfs_find_data fd; - hfsplus_cat_entry entry; - struct hfsplus_cat_file *file; - ssize_t res = 0; - - if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode)) - return -EOPNOTSUPP; - - if (size) { - res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); - if (res) - return res; - res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); - if (res) - goto out; - hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, - sizeof(struct hfsplus_cat_file)); - } - file = &entry.file; - - if (!strcmp(name, "hfs.type")) { - if (size >= 4) { - memcpy(value, &file->user_info.fdType, 4); - res = 4; - } else - res = size ? -ERANGE : 4; - } else if (!strcmp(name, "hfs.creator")) { - if (size >= 4) { - memcpy(value, &file->user_info.fdCreator, 4); - res = 4; - } else - res = size ? -ERANGE : 4; - } else - res = -EOPNOTSUPP; -out: - if (size) - hfs_find_exit(&fd); - return res; -} - -#define HFSPLUS_ATTRLIST_SIZE (sizeof("hfs.creator")+sizeof("hfs.type")) - -ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) -{ - struct inode *inode = dentry->d_inode; - - if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode)) - return -EOPNOTSUPP; - - if (!buffer || !size) - return HFSPLUS_ATTRLIST_SIZE; - if (size < HFSPLUS_ATTRLIST_SIZE) - return -ERANGE; - strcpy(buffer, "hfs.type"); - strcpy(buffer + sizeof("hfs.type"), "hfs.creator"); - - return HFSPLUS_ATTRLIST_SIZE; -} diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 796198d26553..7b87284e46dc 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -20,6 +20,7 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb); static void hfsplus_destroy_inode(struct inode *inode); #include "hfsplus_fs.h" +#include "xattr.h" static int hfsplus_system_read_inode(struct inode *inode) { @@ -118,6 +119,7 @@ static int hfsplus_system_write_inode(struct inode *inode) case HFSPLUS_ATTR_CNID: fork = &vhdr->attr_file; tree = sbi->attr_tree; + break; default: return -EIO; } @@ -191,6 +193,12 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) error2 = filemap_write_and_wait(sbi->ext_tree->inode->i_mapping); if (!error) error = error2; + if (sbi->attr_tree) { + error2 = + filemap_write_and_wait(sbi->attr_tree->inode->i_mapping); + if (!error) + error = error2; + } error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping); if (!error) error = error2; @@ -281,6 +289,7 @@ static void hfsplus_put_super(struct super_block *sb) hfsplus_sync_fs(sb, 1); } + hfs_btree_close(sbi->attr_tree); hfs_btree_close(sbi->cat_tree); hfs_btree_close(sbi->ext_tree); iput(sbi->alloc_file); @@ -477,12 +486,20 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) printk(KERN_ERR "hfs: failed to load catalog file\n"); goto out_close_ext_tree; } + if (vhdr->attr_file.total_blocks != 0) { + sbi->attr_tree = hfs_btree_open(sb, HFSPLUS_ATTR_CNID); + if (!sbi->attr_tree) { + printk(KERN_ERR "hfs: failed to load attributes file\n"); + goto out_close_cat_tree; + } + } + sb->s_xattr = hfsplus_xattr_handlers; inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID); if (IS_ERR(inode)) { printk(KERN_ERR "hfs: failed to load allocation file\n"); err = PTR_ERR(inode); - goto out_close_cat_tree; + goto out_close_attr_tree; } sbi->alloc_file = inode; @@ -542,10 +559,27 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) } err = hfsplus_create_cat(sbi->hidden_dir->i_ino, root, &str, sbi->hidden_dir); - mutex_unlock(&sbi->vh_mutex); - if (err) + if (err) { + mutex_unlock(&sbi->vh_mutex); + goto out_put_hidden_dir; + } + + err = hfsplus_init_inode_security(sbi->hidden_dir, + root, &str); + if (err == -EOPNOTSUPP) + err = 0; /* Operation is not supported. */ + else if (err) { + /* + * Try to delete anyway without + * error analysis. + */ + hfsplus_delete_cat(sbi->hidden_dir->i_ino, + root, &str); + mutex_unlock(&sbi->vh_mutex); goto out_put_hidden_dir; + } + mutex_unlock(&sbi->vh_mutex); hfsplus_mark_inode_dirty(sbi->hidden_dir, HFSPLUS_I_CAT_DIRTY); } @@ -562,6 +596,8 @@ out_put_root: sb->s_root = NULL; out_put_alloc_file: iput(sbi->alloc_file); +out_close_attr_tree: + hfs_btree_close(sbi->attr_tree); out_close_cat_tree: hfs_btree_close(sbi->cat_tree); out_close_ext_tree: @@ -618,6 +654,7 @@ static struct file_system_type hfsplus_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("hfsplus"); static void hfsplus_init_once(void *p) { @@ -635,9 +672,20 @@ static int __init init_hfsplus_fs(void) hfsplus_init_once); if (!hfsplus_inode_cachep) return -ENOMEM; + err = hfsplus_create_attr_tree_cache(); + if (err) + goto destroy_inode_cache; err = register_filesystem(&hfsplus_fs_type); if (err) - kmem_cache_destroy(hfsplus_inode_cachep); + goto destroy_attr_tree_cache; + return 0; + +destroy_attr_tree_cache: + hfsplus_destroy_attr_tree_cache(); + +destroy_inode_cache: + kmem_cache_destroy(hfsplus_inode_cachep); + return err; } @@ -650,6 +698,7 @@ static void __exit exit_hfsplus_fs(void) * destroy cache. */ rcu_barrier(); + hfsplus_destroy_attr_tree_cache(); kmem_cache_destroy(hfsplus_inode_cachep); } diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index a32998f29f0b..2c2e47dcfdd8 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -295,7 +295,8 @@ static inline u16 *decompose_unichar(wchar_t uc, int *size) return hfsplus_decompose_table + (off / 4); } -int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, +int hfsplus_asc2uni(struct super_block *sb, + struct hfsplus_unistr *ustr, int max_unistr_len, const char *astr, int len) { int size, dsize, decompose; @@ -303,7 +304,7 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, wchar_t c; decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); - while (outlen < HFSPLUS_MAX_STRLEN && len > 0) { + while (outlen < max_unistr_len && len > 0) { size = asc2unichar(sb, astr, len, &c); if (decompose) @@ -311,7 +312,7 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, else dstr = NULL; if (dstr) { - if (outlen + dsize > HFSPLUS_MAX_STRLEN) + if (outlen + dsize > max_unistr_len) break; do { ustr->unicode[outlen++] = cpu_to_be16(*dstr++); diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c new file mode 100644 index 000000000000..e8a4b0815c61 --- /dev/null +++ b/fs/hfsplus/xattr.c @@ -0,0 +1,709 @@ +/* + * linux/fs/hfsplus/xattr.c + * + * Vyacheslav Dubeyko <slava@dubeyko.com> + * + * Logic of processing extended attributes + */ + +#include "hfsplus_fs.h" +#include "xattr.h" + +const struct xattr_handler *hfsplus_xattr_handlers[] = { + &hfsplus_xattr_osx_handler, + &hfsplus_xattr_user_handler, + &hfsplus_xattr_trusted_handler, + &hfsplus_xattr_security_handler, + NULL +}; + +static int strcmp_xattr_finder_info(const char *name) +{ + if (name) { + return strncmp(name, HFSPLUS_XATTR_FINDER_INFO_NAME, + sizeof(HFSPLUS_XATTR_FINDER_INFO_NAME)); + } + return -1; +} + +static int strcmp_xattr_acl(const char *name) +{ + if (name) { + return strncmp(name, HFSPLUS_XATTR_ACL_NAME, + sizeof(HFSPLUS_XATTR_ACL_NAME)); + } + return -1; +} + +static inline int is_known_namespace(const char *name) +{ + if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) && + strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && + strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && + strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) + return false; + + return true; +} + +static int can_set_xattr(struct inode *inode, const char *name, + const void *value, size_t value_len) +{ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return -EOPNOTSUPP; /* TODO: implement ACL support */ + + if (!strncmp(name, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN)) { + /* + * This makes sure that we aren't trying to set an + * attribute in a different namespace by prefixing it + * with "osx." + */ + if (is_known_namespace(name + XATTR_MAC_OSX_PREFIX_LEN)) + return -EOPNOTSUPP; + + return 0; + } + + /* + * Don't allow setting an attribute in an unknown namespace. + */ + if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) && + strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && + strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) + return -EOPNOTSUPP; + + return 0; +} + +int __hfsplus_setxattr(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + int err = 0; + struct hfs_find_data cat_fd; + hfsplus_cat_entry entry; + u16 cat_entry_flags, cat_entry_type; + u16 folder_finderinfo_len = sizeof(struct DInfo) + + sizeof(struct DXInfo); + u16 file_finderinfo_len = sizeof(struct FInfo) + + sizeof(struct FXInfo); + + if ((!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode)) || + HFSPLUS_IS_RSRC(inode)) + return -EOPNOTSUPP; + + err = can_set_xattr(inode, name, value, size); + if (err) + return err; + + if (strncmp(name, XATTR_MAC_OSX_PREFIX, + XATTR_MAC_OSX_PREFIX_LEN) == 0) + name += XATTR_MAC_OSX_PREFIX_LEN; + + if (value == NULL) { + value = ""; + size = 0; + } + + err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd); + if (err) { + printk(KERN_ERR "hfs: can't init xattr find struct\n"); + return err; + } + + err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &cat_fd); + if (err) { + printk(KERN_ERR "hfs: catalog searching failed\n"); + goto end_setxattr; + } + + if (!strcmp_xattr_finder_info(name)) { + if (flags & XATTR_CREATE) { + printk(KERN_ERR "hfs: xattr exists yet\n"); + err = -EOPNOTSUPP; + goto end_setxattr; + } + hfs_bnode_read(cat_fd.bnode, &entry, cat_fd.entryoffset, + sizeof(hfsplus_cat_entry)); + if (be16_to_cpu(entry.type) == HFSPLUS_FOLDER) { + if (size == folder_finderinfo_len) { + memcpy(&entry.folder.user_info, value, + folder_finderinfo_len); + hfs_bnode_write(cat_fd.bnode, &entry, + cat_fd.entryoffset, + sizeof(struct hfsplus_cat_folder)); + hfsplus_mark_inode_dirty(inode, + HFSPLUS_I_CAT_DIRTY); + } else { + err = -ERANGE; + goto end_setxattr; + } + } else if (be16_to_cpu(entry.type) == HFSPLUS_FILE) { + if (size == file_finderinfo_len) { + memcpy(&entry.file.user_info, value, + file_finderinfo_len); + hfs_bnode_write(cat_fd.bnode, &entry, + cat_fd.entryoffset, + sizeof(struct hfsplus_cat_file)); + hfsplus_mark_inode_dirty(inode, + HFSPLUS_I_CAT_DIRTY); + } else { + err = -ERANGE; + goto end_setxattr; + } + } else { + err = -EOPNOTSUPP; + goto end_setxattr; + } + goto end_setxattr; + } + + if (!HFSPLUS_SB(inode->i_sb)->attr_tree) { + err = -EOPNOTSUPP; + goto end_setxattr; + } + + if (hfsplus_attr_exists(inode, name)) { + if (flags & XATTR_CREATE) { + printk(KERN_ERR "hfs: xattr exists yet\n"); + err = -EOPNOTSUPP; + goto end_setxattr; + } + err = hfsplus_delete_attr(inode, name); + if (err) + goto end_setxattr; + err = hfsplus_create_attr(inode, name, value, size); + if (err) + goto end_setxattr; + } else { + if (flags & XATTR_REPLACE) { + printk(KERN_ERR "hfs: cannot replace xattr\n"); + err = -EOPNOTSUPP; + goto end_setxattr; + } + err = hfsplus_create_attr(inode, name, value, size); + if (err) + goto end_setxattr; + } + + cat_entry_type = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset); + if (cat_entry_type == HFSPLUS_FOLDER) { + cat_entry_flags = hfs_bnode_read_u16(cat_fd.bnode, + cat_fd.entryoffset + + offsetof(struct hfsplus_cat_folder, flags)); + cat_entry_flags |= HFSPLUS_XATTR_EXISTS; + if (!strcmp_xattr_acl(name)) + cat_entry_flags |= HFSPLUS_ACL_EXISTS; + hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset + + offsetof(struct hfsplus_cat_folder, flags), + cat_entry_flags); + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); + } else if (cat_entry_type == HFSPLUS_FILE) { + cat_entry_flags = hfs_bnode_read_u16(cat_fd.bnode, + cat_fd.entryoffset + + offsetof(struct hfsplus_cat_file, flags)); + cat_entry_flags |= HFSPLUS_XATTR_EXISTS; + if (!strcmp_xattr_acl(name)) + cat_entry_flags |= HFSPLUS_ACL_EXISTS; + hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset + + offsetof(struct hfsplus_cat_file, flags), + cat_entry_flags); + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); + } else { + printk(KERN_ERR "hfs: invalid catalog entry type\n"); + err = -EIO; + goto end_setxattr; + } + +end_setxattr: + hfs_find_exit(&cat_fd); + return err; +} + +static inline int is_osx_xattr(const char *xattr_name) +{ + return !is_known_namespace(xattr_name); +} + +static int name_len(const char *xattr_name, int xattr_name_len) +{ + int len = xattr_name_len + 1; + + if (is_osx_xattr(xattr_name)) + len += XATTR_MAC_OSX_PREFIX_LEN; + + return len; +} + +static int copy_name(char *buffer, const char *xattr_name, int name_len) +{ + int len = name_len; + int offset = 0; + + if (is_osx_xattr(xattr_name)) { + strncpy(buffer, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN); + offset += XATTR_MAC_OSX_PREFIX_LEN; + len += XATTR_MAC_OSX_PREFIX_LEN; + } + + strncpy(buffer + offset, xattr_name, name_len); + memset(buffer + offset + name_len, 0, 1); + len += 1; + + return len; +} + +static ssize_t hfsplus_getxattr_finder_info(struct dentry *dentry, + void *value, size_t size) +{ + ssize_t res = 0; + struct inode *inode = dentry->d_inode; + struct hfs_find_data fd; + u16 entry_type; + u16 folder_rec_len = sizeof(struct DInfo) + sizeof(struct DXInfo); + u16 file_rec_len = sizeof(struct FInfo) + sizeof(struct FXInfo); + u16 record_len = max(folder_rec_len, file_rec_len); + u8 folder_finder_info[sizeof(struct DInfo) + sizeof(struct DXInfo)]; + u8 file_finder_info[sizeof(struct FInfo) + sizeof(struct FXInfo)]; + + if (size >= record_len) { + res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); + if (res) { + printk(KERN_ERR "hfs: can't init xattr find struct\n"); + return res; + } + res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); + if (res) + goto end_getxattr_finder_info; + entry_type = hfs_bnode_read_u16(fd.bnode, fd.entryoffset); + + if (entry_type == HFSPLUS_FOLDER) { + hfs_bnode_read(fd.bnode, folder_finder_info, + fd.entryoffset + + offsetof(struct hfsplus_cat_folder, user_info), + folder_rec_len); + memcpy(value, folder_finder_info, folder_rec_len); + res = folder_rec_len; + } else if (entry_type == HFSPLUS_FILE) { + hfs_bnode_read(fd.bnode, file_finder_info, + fd.entryoffset + + offsetof(struct hfsplus_cat_file, user_info), + file_rec_len); + memcpy(value, file_finder_info, file_rec_len); + res = file_rec_len; + } else { + res = -EOPNOTSUPP; + goto end_getxattr_finder_info; + } + } else + res = size ? -ERANGE : record_len; + +end_getxattr_finder_info: + if (size >= record_len) + hfs_find_exit(&fd); + return res; +} + +ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size) +{ + struct inode *inode = dentry->d_inode; + struct hfs_find_data fd; + hfsplus_attr_entry *entry; + __be32 xattr_record_type; + u32 record_type; + u16 record_length = 0; + ssize_t res = 0; + + if ((!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode)) || + HFSPLUS_IS_RSRC(inode)) + return -EOPNOTSUPP; + + if (strncmp(name, XATTR_MAC_OSX_PREFIX, + XATTR_MAC_OSX_PREFIX_LEN) == 0) { + /* skip "osx." prefix */ + name += XATTR_MAC_OSX_PREFIX_LEN; + /* + * Don't allow retrieving properly prefixed attributes + * by prepending them with "osx." + */ + if (is_known_namespace(name)) + return -EOPNOTSUPP; + } + + if (!strcmp_xattr_finder_info(name)) + return hfsplus_getxattr_finder_info(dentry, value, size); + + if (!HFSPLUS_SB(inode->i_sb)->attr_tree) + return -EOPNOTSUPP; + + entry = hfsplus_alloc_attr_entry(); + if (!entry) { + printk(KERN_ERR "hfs: can't allocate xattr entry\n"); + return -ENOMEM; + } + + res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->attr_tree, &fd); + if (res) { + printk(KERN_ERR "hfs: can't init xattr find struct\n"); + goto failed_getxattr_init; + } + + res = hfsplus_find_attr(inode->i_sb, inode->i_ino, name, &fd); + if (res) { + if (res == -ENOENT) + res = -ENODATA; + else + printk(KERN_ERR "hfs: xattr searching failed\n"); + goto out; + } + + hfs_bnode_read(fd.bnode, &xattr_record_type, + fd.entryoffset, sizeof(xattr_record_type)); + record_type = be32_to_cpu(xattr_record_type); + if (record_type == HFSPLUS_ATTR_INLINE_DATA) { + record_length = hfs_bnode_read_u16(fd.bnode, + fd.entryoffset + + offsetof(struct hfsplus_attr_inline_data, + length)); + if (record_length > HFSPLUS_MAX_INLINE_DATA_SIZE) { + printk(KERN_ERR "hfs: invalid xattr record size\n"); + res = -EIO; + goto out; + } + } else if (record_type == HFSPLUS_ATTR_FORK_DATA || + record_type == HFSPLUS_ATTR_EXTENTS) { + printk(KERN_ERR "hfs: only inline data xattr are supported\n"); + res = -EOPNOTSUPP; + goto out; + } else { + printk(KERN_ERR "hfs: invalid xattr record\n"); + res = -EIO; + goto out; + } + + if (size) { + hfs_bnode_read(fd.bnode, entry, fd.entryoffset, + offsetof(struct hfsplus_attr_inline_data, + raw_bytes) + record_length); + } + + if (size >= record_length) { + memcpy(value, entry->inline_data.raw_bytes, record_length); + res = record_length; + } else + res = size ? -ERANGE : record_length; + +out: + hfs_find_exit(&fd); + +failed_getxattr_init: + hfsplus_destroy_attr_entry(entry); + return res; +} + +static inline int can_list(const char *xattr_name) +{ + if (!xattr_name) + return 0; + + return strncmp(xattr_name, XATTR_TRUSTED_PREFIX, + XATTR_TRUSTED_PREFIX_LEN) || + capable(CAP_SYS_ADMIN); +} + +static ssize_t hfsplus_listxattr_finder_info(struct dentry *dentry, + char *buffer, size_t size) +{ + ssize_t res = 0; + struct inode *inode = dentry->d_inode; + struct hfs_find_data fd; + u16 entry_type; + u8 folder_finder_info[sizeof(struct DInfo) + sizeof(struct DXInfo)]; + u8 file_finder_info[sizeof(struct FInfo) + sizeof(struct FXInfo)]; + unsigned long len, found_bit; + int xattr_name_len, symbols_count; + + res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); + if (res) { + printk(KERN_ERR "hfs: can't init xattr find struct\n"); + return res; + } + + res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); + if (res) + goto end_listxattr_finder_info; + + entry_type = hfs_bnode_read_u16(fd.bnode, fd.entryoffset); + if (entry_type == HFSPLUS_FOLDER) { + len = sizeof(struct DInfo) + sizeof(struct DXInfo); + hfs_bnode_read(fd.bnode, folder_finder_info, + fd.entryoffset + + offsetof(struct hfsplus_cat_folder, user_info), + len); + found_bit = find_first_bit((void *)folder_finder_info, len*8); + } else if (entry_type == HFSPLUS_FILE) { + len = sizeof(struct FInfo) + sizeof(struct FXInfo); + hfs_bnode_read(fd.bnode, file_finder_info, + fd.entryoffset + + offsetof(struct hfsplus_cat_file, user_info), + len); + found_bit = find_first_bit((void *)file_finder_info, len*8); + } else { + res = -EOPNOTSUPP; + goto end_listxattr_finder_info; + } + + if (found_bit >= (len*8)) + res = 0; + else { + symbols_count = sizeof(HFSPLUS_XATTR_FINDER_INFO_NAME) - 1; + xattr_name_len = + name_len(HFSPLUS_XATTR_FINDER_INFO_NAME, symbols_count); + if (!buffer || !size) { + if (can_list(HFSPLUS_XATTR_FINDER_INFO_NAME)) + res = xattr_name_len; + } else if (can_list(HFSPLUS_XATTR_FINDER_INFO_NAME)) { + if (size < xattr_name_len) + res = -ERANGE; + else { + res = copy_name(buffer, + HFSPLUS_XATTR_FINDER_INFO_NAME, + symbols_count); + } + } + } + +end_listxattr_finder_info: + hfs_find_exit(&fd); + + return res; +} + +ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + ssize_t err; + ssize_t res = 0; + struct inode *inode = dentry->d_inode; + struct hfs_find_data fd; + u16 key_len = 0; + struct hfsplus_attr_key attr_key; + char strbuf[HFSPLUS_ATTR_MAX_STRLEN + + XATTR_MAC_OSX_PREFIX_LEN + 1] = {0}; + int xattr_name_len; + + if ((!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode)) || + HFSPLUS_IS_RSRC(inode)) + return -EOPNOTSUPP; + + res = hfsplus_listxattr_finder_info(dentry, buffer, size); + if (res < 0) + return res; + else if (!HFSPLUS_SB(inode->i_sb)->attr_tree) + return (res == 0) ? -EOPNOTSUPP : res; + + err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->attr_tree, &fd); + if (err) { + printk(KERN_ERR "hfs: can't init xattr find struct\n"); + return err; + } + + err = hfsplus_find_attr(inode->i_sb, inode->i_ino, NULL, &fd); + if (err) { + if (err == -ENOENT) { + if (res == 0) + res = -ENODATA; + goto end_listxattr; + } else { + res = err; + goto end_listxattr; + } + } + + for (;;) { + key_len = hfs_bnode_read_u16(fd.bnode, fd.keyoffset); + if (key_len == 0 || key_len > fd.tree->max_key_len) { + printk(KERN_ERR "hfs: invalid xattr key length: %d\n", + key_len); + res = -EIO; + goto end_listxattr; + } + + hfs_bnode_read(fd.bnode, &attr_key, + fd.keyoffset, key_len + sizeof(key_len)); + + if (be32_to_cpu(attr_key.cnid) != inode->i_ino) + goto end_listxattr; + + xattr_name_len = HFSPLUS_ATTR_MAX_STRLEN; + if (hfsplus_uni2asc(inode->i_sb, + (const struct hfsplus_unistr *)&fd.key->attr.key_name, + strbuf, &xattr_name_len)) { + printk(KERN_ERR "hfs: unicode conversion failed\n"); + res = -EIO; + goto end_listxattr; + } + + if (!buffer || !size) { + if (can_list(strbuf)) + res += name_len(strbuf, xattr_name_len); + } else if (can_list(strbuf)) { + if (size < (res + name_len(strbuf, xattr_name_len))) { + res = -ERANGE; + goto end_listxattr; + } else + res += copy_name(buffer + res, + strbuf, xattr_name_len); + } + + if (hfs_brec_goto(&fd, 1)) + goto end_listxattr; + } + +end_listxattr: + hfs_find_exit(&fd); + return res; +} + +int hfsplus_removexattr(struct dentry *dentry, const char *name) +{ + int err = 0; + struct inode *inode = dentry->d_inode; + struct hfs_find_data cat_fd; + u16 flags; + u16 cat_entry_type; + int is_xattr_acl_deleted = 0; + int is_all_xattrs_deleted = 0; + + if ((!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode)) || + HFSPLUS_IS_RSRC(inode)) + return -EOPNOTSUPP; + + if (!HFSPLUS_SB(inode->i_sb)->attr_tree) + return -EOPNOTSUPP; + + err = can_set_xattr(inode, name, NULL, 0); + if (err) + return err; + + if (strncmp(name, XATTR_MAC_OSX_PREFIX, + XATTR_MAC_OSX_PREFIX_LEN) == 0) + name += XATTR_MAC_OSX_PREFIX_LEN; + + if (!strcmp_xattr_finder_info(name)) + return -EOPNOTSUPP; + + err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd); + if (err) { + printk(KERN_ERR "hfs: can't init xattr find struct\n"); + return err; + } + + err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &cat_fd); + if (err) { + printk(KERN_ERR "hfs: catalog searching failed\n"); + goto end_removexattr; + } + + err = hfsplus_delete_attr(inode, name); + if (err) + goto end_removexattr; + + is_xattr_acl_deleted = !strcmp_xattr_acl(name); + is_all_xattrs_deleted = !hfsplus_attr_exists(inode, NULL); + + if (!is_xattr_acl_deleted && !is_all_xattrs_deleted) + goto end_removexattr; + + cat_entry_type = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset); + + if (cat_entry_type == HFSPLUS_FOLDER) { + flags = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset + + offsetof(struct hfsplus_cat_folder, flags)); + if (is_xattr_acl_deleted) + flags &= ~HFSPLUS_ACL_EXISTS; + if (is_all_xattrs_deleted) + flags &= ~HFSPLUS_XATTR_EXISTS; + hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset + + offsetof(struct hfsplus_cat_folder, flags), + flags); + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); + } else if (cat_entry_type == HFSPLUS_FILE) { + flags = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset + + offsetof(struct hfsplus_cat_file, flags)); + if (is_xattr_acl_deleted) + flags &= ~HFSPLUS_ACL_EXISTS; + if (is_all_xattrs_deleted) + flags &= ~HFSPLUS_XATTR_EXISTS; + hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset + + offsetof(struct hfsplus_cat_file, flags), + flags); + hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); + } else { + printk(KERN_ERR "hfs: invalid catalog entry type\n"); + err = -EIO; + goto end_removexattr; + } + +end_removexattr: + hfs_find_exit(&cat_fd); + return err; +} + +static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + + XATTR_MAC_OSX_PREFIX_LEN + 1] = {0}; + size_t len = strlen(name); + + if (!strcmp(name, "")) + return -EINVAL; + + if (len > HFSPLUS_ATTR_MAX_STRLEN) + return -EOPNOTSUPP; + + strcpy(xattr_name, XATTR_MAC_OSX_PREFIX); + strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name); + + return hfsplus_getxattr(dentry, xattr_name, buffer, size); +} + +static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) +{ + char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + + XATTR_MAC_OSX_PREFIX_LEN + 1] = {0}; + size_t len = strlen(name); + + if (!strcmp(name, "")) + return -EINVAL; + + if (len > HFSPLUS_ATTR_MAX_STRLEN) + return -EOPNOTSUPP; + + strcpy(xattr_name, XATTR_MAC_OSX_PREFIX); + strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name); + + return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); +} + +static size_t hfsplus_osx_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + /* + * This method is not used. + * It is used hfsplus_listxattr() instead of generic_listxattr(). + */ + return -EOPNOTSUPP; +} + +const struct xattr_handler hfsplus_xattr_osx_handler = { + .prefix = XATTR_MAC_OSX_PREFIX, + .list = hfsplus_osx_listxattr, + .get = hfsplus_osx_getxattr, + .set = hfsplus_osx_setxattr, +}; diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h new file mode 100644 index 000000000000..847b695b984d --- /dev/null +++ b/fs/hfsplus/xattr.h @@ -0,0 +1,60 @@ +/* + * linux/fs/hfsplus/xattr.h + * + * Vyacheslav Dubeyko <slava@dubeyko.com> + * + * Logic of processing extended attributes + */ + +#ifndef _LINUX_HFSPLUS_XATTR_H +#define _LINUX_HFSPLUS_XATTR_H + +#include <linux/xattr.h> + +extern const struct xattr_handler hfsplus_xattr_osx_handler; +extern const struct xattr_handler hfsplus_xattr_user_handler; +extern const struct xattr_handler hfsplus_xattr_trusted_handler; +/*extern const struct xattr_handler hfsplus_xattr_acl_access_handler;*/ +/*extern const struct xattr_handler hfsplus_xattr_acl_default_handler;*/ +extern const struct xattr_handler hfsplus_xattr_security_handler; + +extern const struct xattr_handler *hfsplus_xattr_handlers[]; + +int __hfsplus_setxattr(struct inode *inode, const char *name, + const void *value, size_t size, int flags); + +static inline int hfsplus_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + return __hfsplus_setxattr(dentry->d_inode, name, value, size, flags); +} + +ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size); + +ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size); + +int hfsplus_removexattr(struct dentry *dentry, const char *name); + +int hfsplus_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr); + +static inline int hfsplus_init_acl(struct inode *inode, struct inode *dir) +{ + /*TODO: implement*/ + return 0; +} + +static inline int hfsplus_init_inode_security(struct inode *inode, + struct inode *dir, + const struct qstr *qstr) +{ + int err; + + err = hfsplus_init_acl(inode, dir); + if (!err) + err = hfsplus_init_security(inode, dir, qstr); + return err; +} + +#endif diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c new file mode 100644 index 000000000000..83b842f113c5 --- /dev/null +++ b/fs/hfsplus/xattr_security.c @@ -0,0 +1,104 @@ +/* + * linux/fs/hfsplus/xattr_trusted.c + * + * Vyacheslav Dubeyko <slava@dubeyko.com> + * + * Handler for storing security labels as extended attributes. + */ + +#include <linux/security.h> +#include "hfsplus_fs.h" +#include "xattr.h" + +static int hfsplus_security_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; + size_t len = strlen(name); + + if (!strcmp(name, "")) + return -EINVAL; + + if (len + XATTR_SECURITY_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) + return -EOPNOTSUPP; + + strcpy(xattr_name, XATTR_SECURITY_PREFIX); + strcpy(xattr_name + XATTR_SECURITY_PREFIX_LEN, name); + + return hfsplus_getxattr(dentry, xattr_name, buffer, size); +} + +static int hfsplus_security_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) +{ + char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; + size_t len = strlen(name); + + if (!strcmp(name, "")) + return -EINVAL; + + if (len + XATTR_SECURITY_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) + return -EOPNOTSUPP; + + strcpy(xattr_name, XATTR_SECURITY_PREFIX); + strcpy(xattr_name + XATTR_SECURITY_PREFIX_LEN, name); + + return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); +} + +static size_t hfsplus_security_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + /* + * This method is not used. + * It is used hfsplus_listxattr() instead of generic_listxattr(). + */ + return -EOPNOTSUPP; +} + +static int hfsplus_initxattrs(struct inode *inode, + const struct xattr *xattr_array, + void *fs_info) +{ + const struct xattr *xattr; + char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; + size_t xattr_name_len; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + xattr_name_len = strlen(xattr->name); + + if (xattr_name_len == 0) + continue; + + if (xattr_name_len + XATTR_SECURITY_PREFIX_LEN > + HFSPLUS_ATTR_MAX_STRLEN) + return -EOPNOTSUPP; + + strcpy(xattr_name, XATTR_SECURITY_PREFIX); + strcpy(xattr_name + + XATTR_SECURITY_PREFIX_LEN, xattr->name); + memset(xattr_name + + XATTR_SECURITY_PREFIX_LEN + xattr_name_len, 0, 1); + + err = __hfsplus_setxattr(inode, xattr_name, + xattr->value, xattr->value_len, 0); + if (err) + break; + } + return err; +} + +int hfsplus_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &hfsplus_initxattrs, NULL); +} + +const struct xattr_handler hfsplus_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = hfsplus_security_listxattr, + .get = hfsplus_security_getxattr, + .set = hfsplus_security_setxattr, +}; diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c new file mode 100644 index 000000000000..426cee277542 --- /dev/null +++ b/fs/hfsplus/xattr_trusted.c @@ -0,0 +1,63 @@ +/* + * linux/fs/hfsplus/xattr_trusted.c + * + * Vyacheslav Dubeyko <slava@dubeyko.com> + * + * Handler for trusted extended attributes. + */ + +#include "hfsplus_fs.h" +#include "xattr.h" + +static int hfsplus_trusted_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; + size_t len = strlen(name); + + if (!strcmp(name, "")) + return -EINVAL; + + if (len + XATTR_TRUSTED_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) + return -EOPNOTSUPP; + + strcpy(xattr_name, XATTR_TRUSTED_PREFIX); + strcpy(xattr_name + XATTR_TRUSTED_PREFIX_LEN, name); + + return hfsplus_getxattr(dentry, xattr_name, buffer, size); +} + +static int hfsplus_trusted_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) +{ + char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; + size_t len = strlen(name); + + if (!strcmp(name, "")) + return -EINVAL; + + if (len + XATTR_TRUSTED_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) + return -EOPNOTSUPP; + + strcpy(xattr_name, XATTR_TRUSTED_PREFIX); + strcpy(xattr_name + XATTR_TRUSTED_PREFIX_LEN, name); + + return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); +} + +static size_t hfsplus_trusted_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + /* + * This method is not used. + * It is used hfsplus_listxattr() instead of generic_listxattr(). + */ + return -EOPNOTSUPP; +} + +const struct xattr_handler hfsplus_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = hfsplus_trusted_listxattr, + .get = hfsplus_trusted_getxattr, + .set = hfsplus_trusted_setxattr, +}; diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c new file mode 100644 index 000000000000..e34016561ae0 --- /dev/null +++ b/fs/hfsplus/xattr_user.c @@ -0,0 +1,63 @@ +/* + * linux/fs/hfsplus/xattr_user.c + * + * Vyacheslav Dubeyko <slava@dubeyko.com> + * + * Handler for user extended attributes. + */ + +#include "hfsplus_fs.h" +#include "xattr.h" + +static int hfsplus_user_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; + size_t len = strlen(name); + + if (!strcmp(name, "")) + return -EINVAL; + + if (len + XATTR_USER_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) + return -EOPNOTSUPP; + + strcpy(xattr_name, XATTR_USER_PREFIX); + strcpy(xattr_name + XATTR_USER_PREFIX_LEN, name); + + return hfsplus_getxattr(dentry, xattr_name, buffer, size); +} + +static int hfsplus_user_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) +{ + char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; + size_t len = strlen(name); + + if (!strcmp(name, "")) + return -EINVAL; + + if (len + XATTR_USER_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) + return -EOPNOTSUPP; + + strcpy(xattr_name, XATTR_USER_PREFIX); + strcpy(xattr_name + XATTR_USER_PREFIX_LEN, name); + + return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); +} + +static size_t hfsplus_user_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + /* + * This method is not used. + * It is used hfsplus_listxattr() instead of generic_listxattr(). + */ + return -EOPNOTSUPP; +} + +const struct xattr_handler hfsplus_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = hfsplus_user_listxattr, + .get = hfsplus_user_getxattr, + .set = hfsplus_user_setxattr, +}; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 457addc5c91f..0f6e52d22b84 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -30,7 +30,7 @@ static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode) return list_entry(inode, struct hostfs_inode_info, vfs_inode); } -#define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_path.dentry->d_inode) +#define FILE_HOSTFS_I(file) HOSTFS_I(file_inode(file)) static int hostfs_d_delete(const struct dentry *dentry) { @@ -845,15 +845,8 @@ int hostfs_setattr(struct dentry *dentry, struct iattr *attr) return err; if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) { - int error; - - error = inode_newsize_ok(inode, attr->ia_size); - if (error) - return error; - + attr->ia_size != i_size_read(inode)) truncate_setsize(inode, attr->ia_size); - } setattr_copy(inode, attr); mark_inode_dirty(inode); @@ -861,14 +854,6 @@ int hostfs_setattr(struct dentry *dentry, struct iattr *attr) } static const struct inode_operations hostfs_iops = { - .create = hostfs_create, - .link = hostfs_link, - .unlink = hostfs_unlink, - .symlink = hostfs_symlink, - .mkdir = hostfs_mkdir, - .rmdir = hostfs_rmdir, - .mknod = hostfs_mknod, - .rename = hostfs_rename, .permission = hostfs_permission, .setattr = hostfs_setattr, }; @@ -1001,6 +986,7 @@ static struct file_system_type hostfs_type = { .kill_sb = hostfs_kill_sb, .fs_flags = 0, }; +MODULE_ALIAS_FS("hostfs"); static int __init init_hostfs(void) { diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 78e12b2e0ea2..546f6d39713a 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -25,7 +25,7 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) loff_t new_off = off + (whence == 1 ? filp->f_pos : 0); loff_t pos; struct quad_buffer_head qbh; - struct inode *i = filp->f_path.dentry->d_inode; + struct inode *i = file_inode(filp); struct hpfs_inode_info *hpfs_inode = hpfs_i(i); struct super_block *s = i->i_sb; @@ -57,7 +57,7 @@ fail: static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); struct quad_buffer_head qbh; struct hpfs_dirent *de; diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index fbfe2df5624b..9f9dbeceeee7 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -152,7 +152,7 @@ static ssize_t hpfs_file_write(struct file *file, const char __user *buf, retval = do_sync_write(file, buf, count, ppos); if (retval > 0) { hpfs_lock(file->f_path.dentry->d_sb); - hpfs_i(file->f_path.dentry->d_inode)->i_dirty = 1; + hpfs_i(file_inode(file))->i_dirty = 1; hpfs_unlock(file->f_path.dentry->d_sb); } return retval; diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 5dc06c837105..9edeeb0ea97e 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -147,7 +147,7 @@ static void hpfs_write_inode_ea(struct inode *i, struct fnode *fnode) /*if (le32_to_cpu(fnode->acl_size_l) || le16_to_cpu(fnode->acl_size_s)) { Some unknown structures like ACL may be in fnode, we'd better not overwrite them - hpfs_error(i->i_sb, "fnode %08x has some unknown HPFS386 stuctures", i->i_ino); + hpfs_error(i->i_sb, "fnode %08x has some unknown HPFS386 structures", i->i_ino); } else*/ if (hpfs_sb(i->i_sb)->sb_eas >= 2) { __le32 ea; if (!uid_eq(i->i_uid, hpfs_sb(i->i_sb)->sb_uid) || hpfs_inode->i_ea_uid) { diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index a3076228523d..a0617e706957 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -688,6 +688,7 @@ static struct file_system_type hpfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("hpfs"); static int __init init_hpfs_fs(void) { diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 43b315f2002b..126d3c2e2dee 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -180,7 +180,7 @@ static ssize_t read_proc(struct file *file, char __user *buf, ssize_t count, ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); ssize_t n; - read = file->f_path.dentry->d_inode->i_fop->read; + read = file_inode(file)->i_fop->read; if (!is_user) set_fs(KERNEL_DS); @@ -288,7 +288,7 @@ static ssize_t hppfs_write(struct file *file, const char __user *buf, struct file *proc_file = data->proc_file; ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); - write = proc_file->f_path.dentry->d_inode->i_fop->write; + write = file_inode(proc_file)->i_fop->write; return (*write)(proc_file, buf, len, ppos); } @@ -513,7 +513,7 @@ static loff_t hppfs_llseek(struct file *file, loff_t off, int where) loff_t (*llseek)(struct file *, loff_t, int); loff_t ret; - llseek = proc_file->f_path.dentry->d_inode->i_fop->llseek; + llseek = file_inode(proc_file)->i_fop->llseek; if (llseek != NULL) { ret = (*llseek)(proc_file, off, where); if (ret < 0) @@ -561,7 +561,7 @@ static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir) }); int err; - readdir = proc_file->f_path.dentry->d_inode->i_fop->readdir; + readdir = file_inode(proc_file)->i_fop->readdir; proc_file->f_pos = file->f_pos; err = (*readdir)(proc_file, &dirent, hppfs_filldir); @@ -748,6 +748,7 @@ static struct file_system_type hppfs_type = { .kill_sb = kill_anon_super, .fs_flags = 0, }; +MODULE_ALIAS_FS("hppfs"); static int __init init_hppfs(void) { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 78bde32ea951..84e3d856e91d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -97,7 +97,7 @@ static void huge_pagevec_release(struct pagevec *pvec) static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); loff_t len, vma_len; int ret; struct hstate *h = hstate_file(file); @@ -896,6 +896,7 @@ static struct file_system_type hugetlbfs_fs_type = { .mount = hugetlbfs_mount, .kill_sb = kill_litter_super, }; +MODULE_ALIAS_FS("hugetlbfs"); static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; @@ -918,16 +919,25 @@ static int get_hstate_idx(int page_size_log) return h - hstates; } +static char *hugetlb_dname(struct dentry *dentry, char *buffer, int buflen) +{ + return dynamic_dname(dentry, buffer, buflen, "/%s (deleted)", + dentry->d_name.name); +} + +static struct dentry_operations anon_ops = { + .d_dname = hugetlb_dname +}; + struct file *hugetlb_file_setup(const char *name, unsigned long addr, size_t size, vm_flags_t acctflag, struct user_struct **user, int creat_flags, int page_size_log) { - int error = -ENOMEM; - struct file *file; + struct file *file = ERR_PTR(-ENOMEM); struct inode *inode; struct path path; - struct dentry *root; + struct super_block *sb; struct qstr quick_string; struct hstate *hstate; unsigned long num_pages; @@ -955,17 +965,18 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr, } } - root = hugetlbfs_vfsmount[hstate_idx]->mnt_root; + sb = hugetlbfs_vfsmount[hstate_idx]->mnt_sb; quick_string.name = name; quick_string.len = strlen(quick_string.name); quick_string.hash = 0; - path.dentry = d_alloc(root, &quick_string); + path.dentry = d_alloc_pseudo(sb, &quick_string); if (!path.dentry) goto out_shm_unlock; + d_set_d_op(path.dentry, &anon_ops); path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]); - error = -ENOSPC; - inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0); + file = ERR_PTR(-ENOSPC); + inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0); if (!inode) goto out_dentry; @@ -973,7 +984,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr, size += addr & ~huge_page_mask(hstate); num_pages = ALIGN(size, huge_page_size(hstate)) >> huge_page_shift(hstate); - error = -ENOMEM; + file = ERR_PTR(-ENOMEM); if (hugetlb_reserve_pages(inode, 0, num_pages, NULL, acctflag)) goto out_inode; @@ -981,10 +992,9 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr, inode->i_size = size; clear_nlink(inode); - error = -ENFILE; file = alloc_file(&path, FMODE_WRITE | FMODE_READ, &hugetlbfs_file_operations); - if (!file) + if (IS_ERR(file)) goto out_dentry; /* inode is already attached */ return file; @@ -998,7 +1008,7 @@ out_shm_unlock: user_shm_unlock(size, *user); *user = NULL; } - return ERR_PTR(error); + return file; } static int __init init_hugetlbfs_fs(void) diff --git a/fs/inode.c b/fs/inode.c index 14084b72b259..f5f7c06c36fb 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -798,11 +798,10 @@ static struct inode *find_inode(struct super_block *sb, int (*test)(struct inode *, void *), void *data) { - struct hlist_node *node; struct inode *inode = NULL; repeat: - hlist_for_each_entry(inode, node, head, i_hash) { + hlist_for_each_entry(inode, head, i_hash) { spin_lock(&inode->i_lock); if (inode->i_sb != sb) { spin_unlock(&inode->i_lock); @@ -830,11 +829,10 @@ repeat: static struct inode *find_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino) { - struct hlist_node *node; struct inode *inode = NULL; repeat: - hlist_for_each_entry(inode, node, head, i_hash) { + hlist_for_each_entry(inode, head, i_hash) { spin_lock(&inode->i_lock); if (inode->i_ino != ino) { spin_unlock(&inode->i_lock); @@ -1132,11 +1130,10 @@ EXPORT_SYMBOL(iget_locked); static int test_inode_iunique(struct super_block *sb, unsigned long ino) { struct hlist_head *b = inode_hashtable + hash(sb, ino); - struct hlist_node *node; struct inode *inode; spin_lock(&inode_hash_lock); - hlist_for_each_entry(inode, node, b, i_hash) { + hlist_for_each_entry(inode, b, i_hash) { if (inode->i_ino == ino && inode->i_sb == sb) { spin_unlock(&inode_hash_lock); return 0; @@ -1291,10 +1288,9 @@ int insert_inode_locked(struct inode *inode) struct hlist_head *head = inode_hashtable + hash(sb, ino); while (1) { - struct hlist_node *node; struct inode *old = NULL; spin_lock(&inode_hash_lock); - hlist_for_each_entry(old, node, head, i_hash) { + hlist_for_each_entry(old, head, i_hash) { if (old->i_ino != ino) continue; if (old->i_sb != sb) @@ -1306,7 +1302,7 @@ int insert_inode_locked(struct inode *inode) } break; } - if (likely(!node)) { + if (likely(!old)) { spin_lock(&inode->i_lock); inode->i_state |= I_NEW; hlist_add_head(&inode->i_hash, head); @@ -1334,11 +1330,10 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, struct hlist_head *head = inode_hashtable + hash(sb, hashval); while (1) { - struct hlist_node *node; struct inode *old = NULL; spin_lock(&inode_hash_lock); - hlist_for_each_entry(old, node, head, i_hash) { + hlist_for_each_entry(old, head, i_hash) { if (old->i_sb != sb) continue; if (!test(old, data)) @@ -1350,7 +1345,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, } break; } - if (likely(!node)) { + if (likely(!old)) { spin_lock(&inode->i_lock); inode->i_state |= I_NEW; hlist_add_head(&inode->i_hash, head); @@ -1655,7 +1650,7 @@ EXPORT_SYMBOL(file_remove_suid); int file_update_time(struct file *file) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct timespec now; int sync_it = 0; int ret; diff --git a/fs/internal.h b/fs/internal.h index 2f6af7f645eb..4be78237d896 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -69,7 +69,7 @@ extern void __mnt_drop_write_file(struct file *); /* * fs_struct.c */ -extern void chroot_fs_refs(struct path *, struct path *); +extern void chroot_fs_refs(const struct path *, const struct path *); /* * file_table.c @@ -125,3 +125,8 @@ extern int invalidate_inodes(struct super_block *, bool); * dcache.c */ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); + +/* + * read_write.c + */ +extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); diff --git a/fs/ioctl.c b/fs/ioctl.c index 3bdad6d1f268..fd507fb460f8 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -175,7 +175,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg) struct fiemap fiemap; struct fiemap __user *ufiemap = (struct fiemap __user *) arg; struct fiemap_extent_info fieinfo = { 0, }; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; u64 len; int error; @@ -424,7 +424,7 @@ EXPORT_SYMBOL(generic_block_fiemap); */ int ioctl_preallocate(struct file *filp, void __user *argp) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct space_resv sr; if (copy_from_user(&sr, argp, sizeof(sr))) @@ -449,7 +449,7 @@ int ioctl_preallocate(struct file *filp, void __user *argp) static int file_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); int __user *p = (int __user *)arg; switch (cmd) { @@ -512,7 +512,7 @@ static int ioctl_fioasync(unsigned int fd, struct file *filp, static int ioctl_fsfreeze(struct file *filp) { - struct super_block *sb = filp->f_path.dentry->d_inode->i_sb; + struct super_block *sb = file_inode(filp)->i_sb; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -527,7 +527,7 @@ static int ioctl_fsfreeze(struct file *filp) static int ioctl_fsthaw(struct file *filp) { - struct super_block *sb = filp->f_path.dentry->d_inode->i_sb; + struct super_block *sb = file_inode(filp)->i_sb; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -548,7 +548,7 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, { int error = 0; int __user *argp = (int __user *)arg; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); switch (cmd) { case FIOCLEX: diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index 0b3fa7974fa8..592e5115a561 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -296,7 +296,7 @@ static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount, */ static int zisofs_readpage(struct file *file, struct page *page) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; int err; int i, pcount, full_page; diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index f20437c068a0..a7d5c3c3d4e6 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c @@ -253,7 +253,7 @@ static int isofs_readdir(struct file *filp, int result; char *tmpname; struct iso_directory_record *tmpde; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); tmpname = (char *)__get_free_page(GFP_KERNEL); if (tmpname == NULL) diff --git a/fs/isofs/export.c b/fs/isofs/export.c index 2b4f2358eadb..12088d8de3fa 100644 --- a/fs/isofs/export.c +++ b/fs/isofs/export.c @@ -125,10 +125,10 @@ isofs_export_encode_fh(struct inode *inode, */ if (parent && (len < 5)) { *max_len = 5; - return 255; + return FILEID_INVALID; } else if (len < 3) { *max_len = 3; - return 255; + return FILEID_INVALID; } len = 3; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 67ce52507d7d..d9b8aebdeb22 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1556,6 +1556,8 @@ static struct file_system_type iso9660_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("iso9660"); +MODULE_ALIAS("iso9660"); static int __init init_iso9660_fs(void) { @@ -1593,5 +1595,3 @@ static void __exit exit_iso9660_fs(void) module_init(init_iso9660_fs) module_exit(exit_iso9660_fs) MODULE_LICENSE("GPL"); -/* Actual filesystem name is iso9660, as requested in filesystems.c */ -MODULE_ALIAS("iso9660"); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 3091d42992f0..750c70148eff 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -435,7 +435,12 @@ void jbd2_journal_commit_transaction(journal_t *journal) trace_jbd2_commit_locking(journal, commit_transaction); stats.run.rs_wait = commit_transaction->t_max_wait; + stats.run.rs_request_delay = 0; stats.run.rs_locked = jiffies; + if (commit_transaction->t_requested) + stats.run.rs_request_delay = + jbd2_time_diff(commit_transaction->t_requested, + stats.run.rs_locked); stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start, stats.run.rs_locked); @@ -1116,7 +1121,10 @@ restart_loop: */ spin_lock(&journal->j_history_lock); journal->j_stats.ts_tid++; + if (commit_transaction->t_requested) + journal->j_stats.ts_requested++; journal->j_stats.run.rs_wait += stats.run.rs_wait; + journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay; journal->j_stats.run.rs_running += stats.run.rs_running; journal->j_stats.run.rs_locked += stats.run.rs_locked; journal->j_stats.run.rs_flushing += stats.run.rs_flushing; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index dbf41f9452db..ed10991ab006 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -35,7 +35,6 @@ #include <linux/kthread.h> #include <linux/poison.h> #include <linux/proc_fs.h> -#include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/math64.h> #include <linux/hash.h> @@ -51,6 +50,14 @@ #include <asm/uaccess.h> #include <asm/page.h> +#ifdef CONFIG_JBD2_DEBUG +ushort jbd2_journal_enable_debug __read_mostly; +EXPORT_SYMBOL(jbd2_journal_enable_debug); + +module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644); +MODULE_PARM_DESC(jbd2_debug, "Debugging level for jbd2"); +#endif + EXPORT_SYMBOL(jbd2_journal_extend); EXPORT_SYMBOL(jbd2_journal_stop); EXPORT_SYMBOL(jbd2_journal_lock_updates); @@ -513,6 +520,10 @@ int __jbd2_log_space_left(journal_t *journal) */ int __jbd2_log_start_commit(journal_t *journal, tid_t target) { + /* Return if the txn has already requested to be committed */ + if (journal->j_commit_request == target) + return 0; + /* * The only transaction we can possibly wait upon is the * currently running transaction (if it exists). Otherwise, @@ -529,6 +540,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target) jbd_debug(1, "JBD2: requesting commit %d/%d\n", journal->j_commit_request, journal->j_commit_sequence); + journal->j_running_transaction->t_requested = jiffies; wake_up(&journal->j_wait_commit); return 1; } else if (!tid_geq(journal->j_commit_request, target)) @@ -894,13 +906,18 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v) if (v != SEQ_START_TOKEN) return 0; - seq_printf(seq, "%lu transaction, each up to %u blocks\n", - s->stats->ts_tid, - s->journal->j_max_transaction_buffers); + seq_printf(seq, "%lu transactions (%lu requested), " + "each up to %u blocks\n", + s->stats->ts_tid, s->stats->ts_requested, + s->journal->j_max_transaction_buffers); if (s->stats->ts_tid == 0) return 0; seq_printf(seq, "average: \n %ums waiting for transaction\n", jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid)); + seq_printf(seq, " %ums request delay\n", + (s->stats->ts_requested == 0) ? 0 : + jiffies_to_msecs(s->stats->run.rs_request_delay / + s->stats->ts_requested)); seq_printf(seq, " %ums running transaction\n", jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid)); seq_printf(seq, " %ums transaction was being locked\n", @@ -2485,45 +2502,6 @@ restart: spin_unlock(&journal->j_list_lock); } -/* - * debugfs tunables - */ -#ifdef CONFIG_JBD2_DEBUG -u8 jbd2_journal_enable_debug __read_mostly; -EXPORT_SYMBOL(jbd2_journal_enable_debug); - -#define JBD2_DEBUG_NAME "jbd2-debug" - -static struct dentry *jbd2_debugfs_dir; -static struct dentry *jbd2_debug; - -static void __init jbd2_create_debugfs_entry(void) -{ - jbd2_debugfs_dir = debugfs_create_dir("jbd2", NULL); - if (jbd2_debugfs_dir) - jbd2_debug = debugfs_create_u8(JBD2_DEBUG_NAME, - S_IRUGO | S_IWUSR, - jbd2_debugfs_dir, - &jbd2_journal_enable_debug); -} - -static void __exit jbd2_remove_debugfs_entry(void) -{ - debugfs_remove(jbd2_debug); - debugfs_remove(jbd2_debugfs_dir); -} - -#else - -static void __init jbd2_create_debugfs_entry(void) -{ -} - -static void __exit jbd2_remove_debugfs_entry(void) -{ -} - -#endif #ifdef CONFIG_PROC_FS @@ -2609,7 +2587,6 @@ static int __init journal_init(void) ret = journal_init_caches(); if (ret == 0) { - jbd2_create_debugfs_entry(); jbd2_create_jbd_stats_proc_entry(); } else { jbd2_journal_destroy_caches(); @@ -2624,7 +2601,6 @@ static void __exit journal_exit(void) if (n) printk(KERN_EMERG "JBD2: leaked %d journal_heads!\n", n); #endif - jbd2_remove_debugfs_entry(); jbd2_remove_jbd_stats_proc_entry(); jbd2_journal_destroy_caches(); } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index df9f29760efa..325bc019ed88 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -30,6 +30,8 @@ #include <linux/bug.h> #include <linux/module.h> +#include <trace/events/jbd2.h> + static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); static void __jbd2_journal_unfile_buffer(struct journal_head *jh); @@ -100,6 +102,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) journal->j_running_transaction = transaction; transaction->t_max_wait = 0; transaction->t_start = jiffies; + transaction->t_requested = 0; return transaction; } @@ -306,6 +309,8 @@ repeat: */ update_t_max_wait(transaction, ts); handle->h_transaction = transaction; + handle->h_requested_credits = nblocks; + handle->h_start_jiffies = jiffies; atomic_inc(&transaction->t_updates); atomic_inc(&transaction->t_handle_count); jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", @@ -352,7 +357,8 @@ static handle_t *new_handle(int nblocks) * Return a pointer to a newly allocated handle, or an ERR_PTR() value * on failure. */ -handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask) +handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask, + unsigned int type, unsigned int line_no) { handle_t *handle = journal_current_handle(); int err; @@ -376,8 +382,13 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask) if (err < 0) { jbd2_free_handle(handle); current->journal_info = NULL; - handle = ERR_PTR(err); + return ERR_PTR(err); } + handle->h_type = type; + handle->h_line_no = line_no; + trace_jbd2_handle_start(journal->j_fs_dev->bd_dev, + handle->h_transaction->t_tid, type, + line_no, nblocks); return handle; } EXPORT_SYMBOL(jbd2__journal_start); @@ -385,7 +396,7 @@ EXPORT_SYMBOL(jbd2__journal_start); handle_t *jbd2_journal_start(journal_t *journal, int nblocks) { - return jbd2__journal_start(journal, nblocks, GFP_NOFS); + return jbd2__journal_start(journal, nblocks, GFP_NOFS, 0, 0); } EXPORT_SYMBOL(jbd2_journal_start); @@ -447,7 +458,14 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) goto unlock; } + trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev, + handle->h_transaction->t_tid, + handle->h_type, handle->h_line_no, + handle->h_buffer_credits, + nblocks); + handle->h_buffer_credits += nblocks; + handle->h_requested_credits += nblocks; atomic_add(nblocks, &transaction->t_outstanding_credits); result = 0; @@ -1047,9 +1065,12 @@ out: void jbd2_journal_set_triggers(struct buffer_head *bh, struct jbd2_buffer_trigger_type *type) { - struct journal_head *jh = bh2jh(bh); + struct journal_head *jh = jbd2_journal_grab_journal_head(bh); + if (WARN_ON(!jh)) + return; jh->b_triggers = type; + jbd2_journal_put_journal_head(jh); } void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, @@ -1101,17 +1122,18 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; - struct journal_head *jh = bh2jh(bh); + struct journal_head *jh; int ret = 0; - jbd_debug(5, "journal_head %p\n", jh); - JBUFFER_TRACE(jh, "entry"); if (is_handle_aborted(handle)) goto out; - if (!buffer_jbd(bh)) { + jh = jbd2_journal_grab_journal_head(bh); + if (!jh) { ret = -EUCLEAN; goto out; } + jbd_debug(5, "journal_head %p\n", jh); + JBUFFER_TRACE(jh, "entry"); jbd_lock_bh_state(bh); @@ -1202,6 +1224,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) spin_unlock(&journal->j_list_lock); out_unlock_bh: jbd_unlock_bh_state(bh); + jbd2_journal_put_journal_head(jh); out: JBUFFER_TRACE(jh, "exit"); WARN_ON(ret); /* All errors are bugs, so dump the stack */ @@ -1376,6 +1399,13 @@ int jbd2_journal_stop(handle_t *handle) } jbd_debug(4, "Handle %p going down\n", handle); + trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev, + handle->h_transaction->t_tid, + handle->h_type, handle->h_line_no, + jiffies - handle->h_start_jiffies, + handle->h_sync, handle->h_requested_credits, + (handle->h_requested_credits - + handle->h_buffer_credits)); /* * Implement synchronous transaction batching. If the handle diff --git a/fs/jffs2/Kconfig b/fs/jffs2/Kconfig index 6ae169cd8faa..d8bb6c411e96 100644 --- a/fs/jffs2/Kconfig +++ b/fs/jffs2/Kconfig @@ -50,8 +50,8 @@ config JFFS2_FS_WBUF_VERIFY write-buffer, and check for errors. config JFFS2_SUMMARY - bool "JFFS2 summary support (EXPERIMENTAL)" - depends on JFFS2_FS && EXPERIMENTAL + bool "JFFS2 summary support" + depends on JFFS2_FS default n help This feature makes it possible to use summary information @@ -63,8 +63,8 @@ config JFFS2_SUMMARY If unsure, say 'N'. config JFFS2_FS_XATTR - bool "JFFS2 XATTR support (EXPERIMENTAL)" - depends on JFFS2_FS && EXPERIMENTAL + bool "JFFS2 XATTR support" + depends on JFFS2_FS default n help Extended attributes are name:value pairs associated with inodes by @@ -173,7 +173,7 @@ config JFFS2_CMODE_PRIORITY successful one. config JFFS2_CMODE_SIZE - bool "size (EXPERIMENTAL)" + bool "size" help Tries all compressors and chooses the one which has the smallest result. diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index ad7774d32095..acd46a4160cb 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -117,12 +117,12 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target, static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir) { struct jffs2_inode_info *f; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct jffs2_full_dirent *fd; unsigned long offset, curofs; jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n", - filp->f_path.dentry->d_inode->i_ino); + file_inode(filp)->i_ino); f = JFFS2_INODE_INFO(inode); diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index d3d8799e2187..0defb1cc2a35 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -356,6 +356,7 @@ static struct file_system_type jffs2_fs_type = { .mount = jffs2_mount, .kill_sb = jffs2_kill_sb, }; +MODULE_ALIAS_FS("jffs2"); static int __init init_jffs2_fs(void) { diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index bc555ff417e9..93a1232894f6 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -58,7 +58,7 @@ static long jfs_map_ext2(unsigned long flags, int from) long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = file_inode(filp); struct jfs_inode_info *jfs_inode = JFS_IP(inode); unsigned int flags; diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 9197a1b0d02d..0ddbeceafc62 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -3004,7 +3004,7 @@ static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent) */ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *ip = filp->f_path.dentry->d_inode; + struct inode *ip = file_inode(filp); struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab; int rc = 0; loff_t dtpos; /* legacy OS/2 style position */ diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 1a543be09c79..2003e830ed1c 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -154,7 +154,7 @@ static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) /* * If we really return the number of allocated & free inodes, some * applications will fail because they won't see enough free inodes. - * We'll try to calculate some guess as to how may inodes we can + * We'll try to calculate some guess as to how many inodes we can * really allocate * * buf->f_files = atomic_read(&imap->im_numinos); @@ -833,6 +833,7 @@ static struct file_system_type jfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("jfs"); static void init_once(void *foo) { diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index ca0a08001449..0796c45d0d4d 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -11,7 +11,7 @@ #include <linux/slab.h> #include <linux/time.h> #include <linux/nfs_fs.h> -#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/svc.h> #include <linux/lockd/lockd.h> #include <linux/kthread.h> @@ -178,7 +178,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) continue; if (!rpc_cmp_addr(nlm_addr(block->b_host), addr)) continue; - if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) + if (nfs_compare_fh(NFS_FH(file_inode(fl_blocked->fl_file)) ,fh) != 0) continue; /* Alright, we found a lock. Set the return status * and wake up the caller @@ -220,10 +220,19 @@ reclaimer(void *ptr) { struct nlm_host *host = (struct nlm_host *) ptr; struct nlm_wait *block; + struct nlm_rqst *req; struct file_lock *fl, *next; u32 nsmstate; struct net *net = host->net; + req = kmalloc(sizeof(*req), GFP_KERNEL); + if (!req) { + printk(KERN_ERR "lockd: reclaimer unable to alloc memory." + " Locks for %s won't be reclaimed!\n", + host->h_name); + return 0; + } + allow_signal(SIGKILL); down_write(&host->h_rwsem); @@ -253,7 +262,7 @@ restart: */ if (signalled()) continue; - if (nlmclnt_reclaim(host, fl) != 0) + if (nlmclnt_reclaim(host, fl, req) != 0) continue; list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted); if (host->h_nsmstate != nsmstate) { @@ -279,5 +288,6 @@ restart: /* Release host handle after use */ nlmclnt_release_host(host); lockd_down(net); + kfree(req); return 0; } diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 54f9e6ce0430..7e529c3c45c0 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -127,7 +127,7 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl) struct nlm_lock *lock = &argp->lock; nlmclnt_next_cookie(&argp->cookie); - memcpy(&lock->fh, NFS_FH(fl->fl_file->f_path.dentry->d_inode), sizeof(struct nfs_fh)); + memcpy(&lock->fh, NFS_FH(file_inode(fl->fl_file)), sizeof(struct nfs_fh)); lock->caller = utsname()->nodename; lock->oh.data = req->a_owner; lock->oh.len = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s", @@ -550,6 +550,9 @@ again: status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT); if (status < 0) break; + /* Resend the blocking lock request after a server reboot */ + if (resp->status == nlm_lck_denied_grace_period) + continue; if (resp->status != nlm_lck_blocked) break; } @@ -615,17 +618,15 @@ out_unlock: * RECLAIM: Try to reclaim a lock */ int -nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl) +nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl, + struct nlm_rqst *req) { - struct nlm_rqst reqst, *req; int status; - req = &reqst; memset(req, 0, sizeof(*req)); locks_init_lock(&req->a_args.lock.fl); locks_init_lock(&req->a_res.lock.fl); req->a_host = host; - req->a_flags = 0; /* Set up the argument struct */ nlmclnt_setlockargs(req, fl); diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 0e17090c310f..969d589c848d 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -13,6 +13,7 @@ #include <linux/in.h> #include <linux/in6.h> #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/svc.h> #include <linux/lockd/lockd.h> #include <linux/mutex.h> @@ -32,15 +33,15 @@ static struct hlist_head nlm_server_hosts[NLM_HOST_NRHASH]; static struct hlist_head nlm_client_hosts[NLM_HOST_NRHASH]; -#define for_each_host(host, pos, chain, table) \ +#define for_each_host(host, chain, table) \ for ((chain) = (table); \ (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \ - hlist_for_each_entry((host), (pos), (chain), h_hash) + hlist_for_each_entry((host), (chain), h_hash) -#define for_each_host_safe(host, pos, next, chain, table) \ +#define for_each_host_safe(host, next, chain, table) \ for ((chain) = (table); \ (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \ - hlist_for_each_entry_safe((host), (pos), (next), \ + hlist_for_each_entry_safe((host), (next), \ (chain), h_hash) static unsigned long nrhosts; @@ -225,7 +226,6 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, .net = net, }; struct hlist_head *chain; - struct hlist_node *pos; struct nlm_host *host; struct nsm_handle *nsm = NULL; struct lockd_net *ln = net_generic(net, lockd_net_id); @@ -237,7 +237,7 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, mutex_lock(&nlm_host_mutex); chain = &nlm_client_hosts[nlm_hash_address(sap)]; - hlist_for_each_entry(host, pos, chain, h_hash) { + hlist_for_each_entry(host, chain, h_hash) { if (host->net != net) continue; if (!rpc_cmp_addr(nlm_addr(host), sap)) @@ -322,7 +322,6 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, const size_t hostname_len) { struct hlist_head *chain; - struct hlist_node *pos; struct nlm_host *host = NULL; struct nsm_handle *nsm = NULL; struct sockaddr *src_sap = svc_daddr(rqstp); @@ -350,7 +349,7 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, nlm_gc_hosts(net); chain = &nlm_server_hosts[nlm_hash_address(ni.sap)]; - hlist_for_each_entry(host, pos, chain, h_hash) { + hlist_for_each_entry(host, chain, h_hash) { if (host->net != net) continue; if (!rpc_cmp_addr(nlm_addr(host), ni.sap)) @@ -515,10 +514,9 @@ static struct nlm_host *next_host_state(struct hlist_head *cache, { struct nlm_host *host; struct hlist_head *chain; - struct hlist_node *pos; mutex_lock(&nlm_host_mutex); - for_each_host(host, pos, chain, cache) { + for_each_host(host, chain, cache) { if (host->h_nsmhandle == nsm && host->h_nsmstate != info->state) { host->h_nsmstate = info->state; @@ -570,7 +568,6 @@ void nlm_host_rebooted(const struct nlm_reboot *info) static void nlm_complain_hosts(struct net *net) { struct hlist_head *chain; - struct hlist_node *pos; struct nlm_host *host; if (net) { @@ -587,7 +584,7 @@ static void nlm_complain_hosts(struct net *net) dprintk("lockd: %lu hosts left:\n", nrhosts); } - for_each_host(host, pos, chain, nlm_server_hosts) { + for_each_host(host, chain, nlm_server_hosts) { if (net && host->net != net) continue; dprintk(" %s (cnt %d use %d exp %ld net %p)\n", @@ -600,14 +597,13 @@ void nlm_shutdown_hosts_net(struct net *net) { struct hlist_head *chain; - struct hlist_node *pos; struct nlm_host *host; mutex_lock(&nlm_host_mutex); /* First, make all hosts eligible for gc */ dprintk("lockd: nuking all hosts in net %p...\n", net); - for_each_host(host, pos, chain, nlm_server_hosts) { + for_each_host(host, chain, nlm_server_hosts) { if (net && host->net != net) continue; host->h_expires = jiffies - 1; @@ -644,11 +640,11 @@ static void nlm_gc_hosts(struct net *net) { struct hlist_head *chain; - struct hlist_node *pos, *next; + struct hlist_node *next; struct nlm_host *host; dprintk("lockd: host garbage collection for net %p\n", net); - for_each_host(host, pos, chain, nlm_server_hosts) { + for_each_host(host, chain, nlm_server_hosts) { if (net && host->net != net) continue; host->h_inuse = 0; @@ -657,7 +653,7 @@ nlm_gc_hosts(struct net *net) /* Mark all hosts that hold locks, blocks or shares */ nlmsvc_mark_resources(net); - for_each_host_safe(host, pos, next, chain, nlm_server_hosts) { + for_each_host_safe(host, next, chain, nlm_server_hosts) { if (net && host->net != net) continue; if (atomic_read(&host->h_count) || host->h_inuse diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 3c2cfc683631..1812f026960c 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -12,6 +12,7 @@ #include <linux/slab.h> #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/xprtsock.h> #include <linux/sunrpc/svc.h> #include <linux/lockd/lockd.h> diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 8d80c990dffd..e703318c41df 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -406,8 +406,8 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, __be32 ret; dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n", - file->f_file->f_path.dentry->d_inode->i_sb->s_id, - file->f_file->f_path.dentry->d_inode->i_ino, + file_inode(file->f_file)->i_sb->s_id, + file_inode(file->f_file)->i_ino, lock->fl.fl_type, lock->fl.fl_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end, @@ -513,8 +513,8 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, __be32 ret; dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n", - file->f_file->f_path.dentry->d_inode->i_sb->s_id, - file->f_file->f_path.dentry->d_inode->i_ino, + file_inode(file->f_file)->i_sb->s_id, + file_inode(file->f_file)->i_ino, lock->fl.fl_type, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); @@ -606,8 +606,8 @@ nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock) int error; dprintk("lockd: nlmsvc_unlock(%s/%ld, pi=%d, %Ld-%Ld)\n", - file->f_file->f_path.dentry->d_inode->i_sb->s_id, - file->f_file->f_path.dentry->d_inode->i_ino, + file_inode(file->f_file)->i_sb->s_id, + file_inode(file->f_file)->i_ino, lock->fl.fl_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); @@ -635,8 +635,8 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l int status = 0; dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n", - file->f_file->f_path.dentry->d_inode->i_sb->s_id, - file->f_file->f_path.dentry->d_inode->i_ino, + file_inode(file->f_file)->i_sb->s_id, + file_inode(file->f_file)->i_ino, lock->fl.fl_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 0deb5f6c9dd4..97e87415b145 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -13,7 +13,7 @@ #include <linux/slab.h> #include <linux/mutex.h> #include <linux/sunrpc/svc.h> -#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> #include <linux/nfsd/nfsfh.h> #include <linux/nfsd/export.h> #include <linux/lockd/lockd.h> @@ -45,7 +45,7 @@ static inline void nlm_debug_print_fh(char *msg, struct nfs_fh *f) static inline void nlm_debug_print_file(char *msg, struct nlm_file *file) { - struct inode *inode = file->f_file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file->f_file); dprintk("lockd: %s %s/%ld\n", msg, inode->i_sb->s_id, inode->i_ino); @@ -84,7 +84,6 @@ __be32 nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result, struct nfs_fh *f) { - struct hlist_node *pos; struct nlm_file *file; unsigned int hash; __be32 nfserr; @@ -96,7 +95,7 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result, /* Lock file table */ mutex_lock(&nlm_file_mutex); - hlist_for_each_entry(file, pos, &nlm_files[hash], f_list) + hlist_for_each_entry(file, &nlm_files[hash], f_list) if (!nfs_compare_fh(&file->f_handle, f)) goto found; @@ -248,13 +247,13 @@ static int nlm_traverse_files(void *data, nlm_host_match_fn_t match, int (*is_failover_file)(void *data, struct nlm_file *file)) { - struct hlist_node *pos, *next; + struct hlist_node *next; struct nlm_file *file; int i, ret = 0; mutex_lock(&nlm_file_mutex); for (i = 0; i < FILE_NRHASH; i++) { - hlist_for_each_entry_safe(file, pos, next, &nlm_files[i], f_list) { + hlist_for_each_entry_safe(file, next, &nlm_files[i], f_list) { if (is_failover_file && !is_failover_file(data, file)) continue; file->f_count++; diff --git a/fs/locks.c b/fs/locks.c index a94e331a52a2..cb424a4fed71 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -334,7 +334,7 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl, start = filp->f_pos; break; case SEEK_END: - start = i_size_read(filp->f_path.dentry->d_inode); + start = i_size_read(file_inode(filp)); break; default: return -EINVAL; @@ -384,7 +384,7 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, start = filp->f_pos; break; case SEEK_END: - start = i_size_read(filp->f_path.dentry->d_inode); + start = i_size_read(file_inode(filp)); break; default: return -EINVAL; @@ -627,7 +627,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl) struct file_lock *cfl; lock_flocks(); - for (cfl = filp->f_path.dentry->d_inode->i_flock; cfl; cfl = cfl->fl_next) { + for (cfl = file_inode(filp)->i_flock; cfl; cfl = cfl->fl_next) { if (!IS_POSIX(cfl)) continue; if (posix_locks_conflict(fl, cfl)) @@ -708,7 +708,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) { struct file_lock *new_fl = NULL; struct file_lock **before; - struct inode * inode = filp->f_path.dentry->d_inode; + struct inode * inode = file_inode(filp); int error = 0; int found = 0; @@ -1002,7 +1002,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str int posix_lock_file(struct file *filp, struct file_lock *fl, struct file_lock *conflock) { - return __posix_lock_file(filp->f_path.dentry->d_inode, fl, conflock); + return __posix_lock_file(file_inode(filp), fl, conflock); } EXPORT_SYMBOL(posix_lock_file); @@ -1326,8 +1326,8 @@ int fcntl_getlease(struct file *filp) int type = F_UNLCK; lock_flocks(); - time_out_leases(filp->f_path.dentry->d_inode); - for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl); + time_out_leases(file_inode(filp)); + for (fl = file_inode(filp)->i_flock; fl && IS_LEASE(fl); fl = fl->fl_next) { if (fl->fl_file == filp) { type = target_leasetype(fl); @@ -1843,7 +1843,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, if (copy_from_user(&flock, l, sizeof(flock))) goto out; - inode = filp->f_path.dentry->d_inode; + inode = file_inode(filp); /* Don't allow mandatory locks on files that may be memory mapped * and shared. @@ -1961,7 +1961,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, if (copy_from_user(&flock, l, sizeof(flock))) goto out; - inode = filp->f_path.dentry->d_inode; + inode = file_inode(filp); /* Don't allow mandatory locks on files that may be memory mapped * and shared. @@ -2030,7 +2030,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner) * posix_lock_file(). Another process could be setting a lock on this * file at the same time, but we wouldn't remove that lock anyway. */ - if (!filp->f_path.dentry->d_inode->i_flock) + if (!file_inode(filp)->i_flock) return; lock.fl_type = F_UNLCK; @@ -2056,7 +2056,7 @@ EXPORT_SYMBOL(locks_remove_posix); */ void locks_remove_flock(struct file *filp) { - struct inode * inode = filp->f_path.dentry->d_inode; + struct inode * inode = file_inode(filp); struct file_lock *fl; struct file_lock **before; @@ -2152,7 +2152,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, fl_pid = fl->fl_pid; if (fl->fl_file != NULL) - inode = fl->fl_file->f_path.dentry->d_inode; + inode = file_inode(fl->fl_file); seq_printf(f, "%lld:%s ", id, pfx); if (IS_POSIX(fl)) { diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig index daf9a9b32dd3..09ed066c0221 100644 --- a/fs/logfs/Kconfig +++ b/fs/logfs/Kconfig @@ -1,6 +1,6 @@ config LOGFS - tristate "LogFS file system (EXPERIMENTAL)" - depends on (MTD || BLOCK) && EXPERIMENTAL + tristate "LogFS file system" + depends on (MTD || BLOCK) select ZLIB_INFLATE select ZLIB_DEFLATE select CRC32 diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index 26e4a941532f..b82751082112 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -284,7 +284,7 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry) #define IMPLICIT_NODES 2 static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir) { - struct inode *dir = file->f_dentry->d_inode; + struct inode *dir = file_inode(file); loff_t pos = file->f_pos - IMPLICIT_NODES; struct page *page; struct logfs_disk_dentry *dd; @@ -320,7 +320,7 @@ static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir) static int logfs_readdir(struct file *file, void *buf, filldir_t filldir) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); ino_t pino = parent_ino(file->f_dentry); int err; diff --git a/fs/logfs/file.c b/fs/logfs/file.c index 3886cded283c..c2219a6dd3c8 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c @@ -183,7 +183,7 @@ static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this) long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct logfs_inode *li = logfs_inode(inode); unsigned int oldflags, flags; int err; diff --git a/fs/logfs/super.c b/fs/logfs/super.c index 345c24b8a6f8..54360293bcb5 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -608,6 +608,7 @@ static struct file_system_type logfs_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("logfs"); static int __init logfs_init(void) { diff --git a/fs/minix/dir.c b/fs/minix/dir.c index 685b2d981b87..a9ed6f36e6ea 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -85,7 +85,7 @@ static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi) static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir) { unsigned long pos = filp->f_pos; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; unsigned offset = pos & ~PAGE_CACHE_MASK; unsigned long n = pos >> PAGE_CACHE_SHIFT; diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 99541cceb584..df122496f328 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -660,6 +660,7 @@ static struct file_system_type minix_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("minix"); static int __init init_minix_fs(void) { diff --git a/fs/namei.c b/fs/namei.c index 43a97ee1d4c8..57ae9c8c66bf 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -451,7 +451,7 @@ int inode_permission(struct inode *inode, int mask) * * Given a path increment the reference count to the dentry and the vfsmount. */ -void path_get(struct path *path) +void path_get(const struct path *path) { mntget(path->mnt); dget(path->dentry); @@ -464,7 +464,7 @@ EXPORT_SYMBOL(path_get); * * Given a path decrement the reference count to the dentry and the vfsmount. */ -void path_put(struct path *path) +void path_put(const struct path *path) { dput(path->dentry); mntput(path->mnt); @@ -600,14 +600,10 @@ static int complete_walk(struct nameidata *nd) if (likely(!(nd->flags & LOOKUP_JUMPED))) return 0; - if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE))) + if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE))) return 0; - if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT))) - return 0; - - /* Note: we do not d_invalidate() */ - status = d_revalidate(dentry, nd->flags); + status = dentry->d_op->d_weak_revalidate(dentry, nd->flags); if (status > 0) return 0; @@ -693,8 +689,6 @@ void nd_jump_link(struct nameidata *nd, struct path *path) nd->path = *path; nd->inode = nd->path.dentry->d_inode; nd->flags |= LOOKUP_JUMPED; - - BUG_ON(nd->inode->i_op->follow_link); } static inline void put_link(struct nameidata *nd, struct path *link, void *cookie) @@ -1342,7 +1336,7 @@ static struct dentry *__lookup_hash(struct qstr *name, * small and for now I'd prefer to have fast path as straight as possible. * It _is_ time-critical. */ -static int lookup_fast(struct nameidata *nd, struct qstr *name, +static int lookup_fast(struct nameidata *nd, struct path *path, struct inode **inode) { struct vfsmount *mnt = nd->path.mnt; @@ -1358,7 +1352,7 @@ static int lookup_fast(struct nameidata *nd, struct qstr *name, */ if (nd->flags & LOOKUP_RCU) { unsigned seq; - dentry = __d_lookup_rcu(parent, name, &seq, nd->inode); + dentry = __d_lookup_rcu(parent, &nd->last, &seq, nd->inode); if (!dentry) goto unlazy; @@ -1400,7 +1394,7 @@ unlazy: if (unlazy_walk(nd, dentry)) return -ECHILD; } else { - dentry = __d_lookup(parent, name); + dentry = __d_lookup(parent, &nd->last); } if (unlikely(!dentry)) @@ -1436,8 +1430,7 @@ need_lookup: } /* Fast lookup failed, do it the slow way */ -static int lookup_slow(struct nameidata *nd, struct qstr *name, - struct path *path) +static int lookup_slow(struct nameidata *nd, struct path *path) { struct dentry *dentry, *parent; int err; @@ -1446,7 +1439,7 @@ static int lookup_slow(struct nameidata *nd, struct qstr *name, BUG_ON(nd->inode != parent->d_inode); mutex_lock(&parent->d_inode->i_mutex); - dentry = __lookup_hash(name, parent, nd->flags); + dentry = __lookup_hash(&nd->last, parent, nd->flags); mutex_unlock(&parent->d_inode->i_mutex); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -1519,7 +1512,7 @@ static inline int should_follow_link(struct inode *inode, int follow) } static inline int walk_component(struct nameidata *nd, struct path *path, - struct qstr *name, int type, int follow) + int follow) { struct inode *inode; int err; @@ -1528,14 +1521,14 @@ static inline int walk_component(struct nameidata *nd, struct path *path, * to be able to know about the current root directory and * parent relationships. */ - if (unlikely(type != LAST_NORM)) - return handle_dots(nd, type); - err = lookup_fast(nd, name, path, &inode); + if (unlikely(nd->last_type != LAST_NORM)) + return handle_dots(nd, nd->last_type); + err = lookup_fast(nd, path, &inode); if (unlikely(err)) { if (err < 0) goto out_err; - err = lookup_slow(nd, name, path); + err = lookup_slow(nd, path); if (err < 0) goto out_err; @@ -1594,8 +1587,7 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd) res = follow_link(&link, nd, &cookie); if (res) break; - res = walk_component(nd, path, &nd->last, - nd->last_type, LOOKUP_FOLLOW); + res = walk_component(nd, path, LOOKUP_FOLLOW); put_link(nd, &link, cookie); } while (res > 0); @@ -1802,8 +1794,11 @@ static int link_path_walk(const char *name, struct nameidata *nd) } } + nd->last = this; + nd->last_type = type; + if (!name[len]) - goto last_component; + return 0; /* * If it wasn't NUL, we know it was '/'. Skip that * slash, and continue until no more slashes. @@ -1812,10 +1807,11 @@ static int link_path_walk(const char *name, struct nameidata *nd) len++; } while (unlikely(name[len] == '/')); if (!name[len]) - goto last_component; + return 0; + name += len; - err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW); + err = walk_component(nd, &next, LOOKUP_FOLLOW); if (err < 0) return err; @@ -1824,16 +1820,10 @@ static int link_path_walk(const char *name, struct nameidata *nd) if (err) return err; } - if (can_lookup(nd->inode)) - continue; - err = -ENOTDIR; - break; - /* here ends the main loop */ - -last_component: - nd->last = this; - nd->last_type = type; - return 0; + if (!can_lookup(nd->inode)) { + err = -ENOTDIR; + break; + } } terminate_walk(nd); return err; @@ -1932,8 +1922,7 @@ static inline int lookup_last(struct nameidata *nd, struct path *path) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; nd->flags &= ~LOOKUP_PARENT; - return walk_component(nd, path, &nd->last, nd->last_type, - nd->flags & LOOKUP_FOLLOW); + return walk_component(nd, path, nd->flags & LOOKUP_FOLLOW); } /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ @@ -2732,7 +2721,7 @@ static int do_last(struct nameidata *nd, struct path *path, if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW)) symlink_ok = true; /* we _can_ be in RCU mode here */ - error = lookup_fast(nd, &nd->last, path, &inode); + error = lookup_fast(nd, path, &inode); if (likely(!error)) goto finish_lookup; @@ -2778,7 +2767,7 @@ retry_lookup: goto out; if ((*opened & FILE_CREATED) || - !S_ISREG(file->f_path.dentry->d_inode->i_mode)) + !S_ISREG(file_inode(file)->i_mode)) will_truncate = false; audit_inode(name, file->f_path.dentry, 0); @@ -2941,8 +2930,8 @@ static struct file *path_openat(int dfd, struct filename *pathname, int error; file = get_empty_filp(); - if (!file) - return ERR_PTR(-ENFILE); + if (IS_ERR(file)) + return file; file->f_flags = op->open_flag; diff --git a/fs/namespace.c b/fs/namespace.c index 55605c552787..d581e45c0a9f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -384,7 +384,7 @@ EXPORT_SYMBOL_GPL(mnt_clone_write); */ int __mnt_want_write_file(struct file *file) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode)) return __mnt_want_write(file->f_path.mnt); @@ -798,6 +798,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, } mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD; + /* Don't allow unprivileged users to change mount flags */ + if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) + mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; + atomic_inc(&sb->s_active); mnt->mnt.mnt_sb = sb; mnt->mnt.mnt_root = dget(root); @@ -1237,6 +1241,14 @@ static int do_umount(struct mount *mnt, int flags) return retval; } +/* + * Is the caller allowed to modify his namespace? + */ +static inline bool may_mount(void) +{ + return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); +} + /* * Now umount can handle mount points as well as block devices. * This is important for filesystems which use unnamed block devices. @@ -1255,6 +1267,9 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags) if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW)) return -EINVAL; + if (!may_mount()) + return -EPERM; + if (!(flags & UMOUNT_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; @@ -1268,10 +1283,6 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags) if (!check_mnt(mnt)) goto dput_and_out; - retval = -EPERM; - if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) - goto dput_and_out; - retval = do_umount(mnt, flags); dput_and_out: /* we mustn't call path_put() as that would clear mnt_expiry_mark */ @@ -1293,24 +1304,6 @@ SYSCALL_DEFINE1(oldumount, char __user *, name) #endif -static int mount_is_safe(struct path *path) -{ - if (ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN)) - return 0; - return -EPERM; -#ifdef notyet - if (S_ISLNK(path->dentry->d_inode->i_mode)) - return -EPERM; - if (path->dentry->d_inode->i_mode & S_ISVTX) { - if (current_uid() != path->dentry->d_inode->i_uid) - return -EPERM; - } - if (inode_permission(path->dentry->d_inode, MAY_WRITE)) - return -EPERM; - return 0; -#endif -} - static bool mnt_ns_loop(struct path *path) { /* Could bind mounting the mount namespace inode cause a @@ -1633,9 +1626,6 @@ static int do_change_type(struct path *path, int flag) int type; int err = 0; - if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) - return -EPERM; - if (path->dentry != path->mnt->mnt_root) return -EINVAL; @@ -1669,9 +1659,7 @@ static int do_loopback(struct path *path, const char *old_name, LIST_HEAD(umount_list); struct path old_path; struct mount *mnt = NULL, *old; - int err = mount_is_safe(path); - if (err) - return err; + int err; if (!old_name || !*old_name) return -EINVAL; err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path); @@ -1729,6 +1717,9 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags) if (readonly_request == __mnt_is_readonly(mnt)) return 0; + if (mnt->mnt_flags & MNT_LOCK_READONLY) + return -EPERM; + if (readonly_request) error = mnt_make_readonly(real_mount(mnt)); else @@ -1748,9 +1739,6 @@ static int do_remount(struct path *path, int flags, int mnt_flags, struct super_block *sb = path->mnt->mnt_sb; struct mount *mnt = real_mount(path->mnt); - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (!check_mnt(mnt)) return -EINVAL; @@ -1764,6 +1752,8 @@ static int do_remount(struct path *path, int flags, int mnt_flags, down_write(&sb->s_umount); if (flags & MS_BIND) err = change_mount_flags(path->mnt, flags); + else if (!capable(CAP_SYS_ADMIN)) + err = -EPERM; else err = do_remount_sb(sb, flags, data, 0); if (!err) { @@ -1796,9 +1786,7 @@ static int do_move_mount(struct path *path, const char *old_name) struct path old_path, parent_path; struct mount *p; struct mount *old; - int err = 0; - if (!ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN)) - return -EPERM; + int err; if (!old_name || !*old_name) return -EINVAL; err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); @@ -1933,18 +1921,13 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, int mnt_flags, const char *name, void *data) { struct file_system_type *type; - struct user_namespace *user_ns; + struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; struct vfsmount *mnt; int err; if (!fstype) return -EINVAL; - /* we need capabilities... */ - user_ns = real_mount(path->mnt)->mnt_ns->user_ns; - if (!ns_capable(user_ns, CAP_SYS_ADMIN)) - return -EPERM; - type = get_fs_type(fstype); if (!type) return -ENODEV; @@ -2258,6 +2241,9 @@ long do_mount(const char *dev_name, const char *dir_name, if (retval) goto dput_out; + if (!may_mount()) + return -EPERM; + /* Default to relatime unless overriden */ if (!(flags & MS_NOATIME)) mnt_flags |= MNT_RELATIME; @@ -2360,7 +2346,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, /* First pass: copy the tree topology */ copy_flags = CL_COPY_ALL | CL_EXPIRE; if (user_ns != mnt_ns->user_ns) - copy_flags |= CL_SHARED_TO_SLAVE; + copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED; new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { up_write(&namespace_sem); @@ -2567,7 +2553,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, struct mount *new_mnt, *root_mnt; int error; - if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN)) + if (!may_mount()) return -EPERM; error = user_path_dir(new_root, &new); @@ -2753,6 +2739,51 @@ bool our_mnt(struct vfsmount *mnt) return check_mnt(real_mount(mnt)); } +bool current_chrooted(void) +{ + /* Does the current process have a non-standard root */ + struct path ns_root; + struct path fs_root; + bool chrooted; + + /* Find the namespace root */ + ns_root.mnt = ¤t->nsproxy->mnt_ns->root->mnt; + ns_root.dentry = ns_root.mnt->mnt_root; + path_get(&ns_root); + while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root)) + ; + + get_fs_root(current->fs, &fs_root); + + chrooted = !path_equal(&fs_root, &ns_root); + + path_put(&fs_root); + path_put(&ns_root); + + return chrooted; +} + +void update_mnt_policy(struct user_namespace *userns) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + struct mount *mnt; + + down_read(&namespace_sem); + list_for_each_entry(mnt, &ns->list, mnt_list) { + switch (mnt->mnt.mnt_sb->s_magic) { + case SYSFS_MAGIC: + userns->may_mount_sysfs = true; + break; + case PROC_SUPER_MAGIC: + userns->may_mount_proc = true; + break; + } + if (userns->may_mount_sysfs && userns->may_mount_proc) + break; + } + up_read(&namespace_sem); +} + static void *mntns_get(struct task_struct *task) { struct mnt_namespace *ns = NULL; diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 4117e7b377bb..816326093656 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -593,14 +593,10 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir, return 1; /* I'm not sure */ qname.name = __name; - qname.hash = full_name_hash(qname.name, qname.len); - - if (dentry->d_op && dentry->d_op->d_hash) - if (dentry->d_op->d_hash(dentry, dentry->d_inode, &qname) != 0) - goto end_advance; - - newdent = d_lookup(dentry, &qname); + newdent = d_hash_and_lookup(dentry, &qname); + if (unlikely(IS_ERR(newdent))) + goto end_advance; if (!newdent) { newdent = d_alloc(dentry, &qname); if (!newdent) diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 1acdad7fcec7..26910c8154da 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -331,12 +331,15 @@ static int ncp_show_options(struct seq_file *seq, struct dentry *root) struct ncp_server *server = NCP_SBP(root->d_sb); unsigned int tmp; - if (server->m.uid != 0) - seq_printf(seq, ",uid=%u", server->m.uid); - if (server->m.gid != 0) - seq_printf(seq, ",gid=%u", server->m.gid); - if (server->m.mounted_uid != 0) - seq_printf(seq, ",owner=%u", server->m.mounted_uid); + if (!uid_eq(server->m.uid, GLOBAL_ROOT_UID)) + seq_printf(seq, ",uid=%u", + from_kuid_munged(&init_user_ns, server->m.uid)); + if (!gid_eq(server->m.gid, GLOBAL_ROOT_GID)) + seq_printf(seq, ",gid=%u", + from_kgid_munged(&init_user_ns, server->m.gid)); + if (!uid_eq(server->m.mounted_uid, GLOBAL_ROOT_UID)) + seq_printf(seq, ",owner=%u", + from_kuid_munged(&init_user_ns, server->m.mounted_uid)); tmp = server->m.file_mode & S_IALLUGO; if (tmp != NCP_DEFAULT_FILE_MODE) seq_printf(seq, ",mode=0%o", tmp); @@ -381,13 +384,13 @@ static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options) data->flags = 0; data->int_flags = 0; - data->mounted_uid = 0; + data->mounted_uid = GLOBAL_ROOT_UID; data->wdog_pid = NULL; data->ncp_fd = ~0; data->time_out = NCP_DEFAULT_TIME_OUT; data->retry_count = NCP_DEFAULT_RETRY_COUNT; - data->uid = 0; - data->gid = 0; + data->uid = GLOBAL_ROOT_UID; + data->gid = GLOBAL_ROOT_GID; data->file_mode = NCP_DEFAULT_FILE_MODE; data->dir_mode = NCP_DEFAULT_DIR_MODE; data->info_fd = -1; @@ -399,13 +402,19 @@ static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options) goto err; switch (optval) { case 'u': - data->uid = optint; + data->uid = make_kuid(current_user_ns(), optint); + if (!uid_valid(data->uid)) + goto err; break; case 'g': - data->gid = optint; + data->gid = make_kgid(current_user_ns(), optint); + if (!gid_valid(data->gid)) + goto err; break; case 'o': - data->mounted_uid = optint; + data->mounted_uid = make_kuid(current_user_ns(), optint); + if (!uid_valid(data->mounted_uid)) + goto err; break; case 'm': data->file_mode = optint; @@ -480,13 +489,13 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) data.flags = md->flags; data.int_flags = NCP_IMOUNT_LOGGEDIN_POSSIBLE; - data.mounted_uid = md->mounted_uid; + data.mounted_uid = make_kuid(current_user_ns(), md->mounted_uid); data.wdog_pid = find_get_pid(md->wdog_pid); data.ncp_fd = md->ncp_fd; data.time_out = md->time_out; data.retry_count = md->retry_count; - data.uid = md->uid; - data.gid = md->gid; + data.uid = make_kuid(current_user_ns(), md->uid); + data.gid = make_kgid(current_user_ns(), md->gid); data.file_mode = md->file_mode; data.dir_mode = md->dir_mode; data.info_fd = -1; @@ -499,13 +508,13 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) struct ncp_mount_data_v4* md = (struct ncp_mount_data_v4*)raw_data; data.flags = md->flags; - data.mounted_uid = md->mounted_uid; + data.mounted_uid = make_kuid(current_user_ns(), md->mounted_uid); data.wdog_pid = find_get_pid(md->wdog_pid); data.ncp_fd = md->ncp_fd; data.time_out = md->time_out; data.retry_count = md->retry_count; - data.uid = md->uid; - data.gid = md->gid; + data.uid = make_kuid(current_user_ns(), md->uid); + data.gid = make_kgid(current_user_ns(), md->gid); data.file_mode = md->file_mode; data.dir_mode = md->dir_mode; data.info_fd = -1; @@ -520,12 +529,16 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) goto out; break; } + error = -EINVAL; + if (!uid_valid(data.mounted_uid) || !uid_valid(data.uid) || + !gid_valid(data.gid)) + goto out; error = -EBADF; ncp_filp = fget(data.ncp_fd); if (!ncp_filp) goto out; error = -ENOTSOCK; - sock_inode = ncp_filp->f_path.dentry->d_inode; + sock_inode = file_inode(ncp_filp); if (!S_ISSOCK(sock_inode->i_mode)) goto out_fput; sock = SOCKET_I(sock_inode); @@ -564,7 +577,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) if (!server->info_filp) goto out_bdi; error = -ENOTSOCK; - sock_inode = server->info_filp->f_path.dentry->d_inode; + sock_inode = file_inode(server->info_filp); if (!S_ISSOCK(sock_inode->i_mode)) goto out_fput2; info_sock = SOCKET_I(sock_inode); @@ -886,12 +899,10 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr) goto out; result = -EPERM; - if (((attr->ia_valid & ATTR_UID) && - (attr->ia_uid != server->m.uid))) + if ((attr->ia_valid & ATTR_UID) && !uid_eq(attr->ia_uid, server->m.uid)) goto out; - if (((attr->ia_valid & ATTR_GID) && - (attr->ia_gid != server->m.gid))) + if ((attr->ia_valid & ATTR_GID) && !gid_eq(attr->ia_gid, server->m.gid)) goto out; if (((attr->ia_valid & ATTR_MODE) && @@ -1040,6 +1051,7 @@ static struct file_system_type ncp_fs_type = { .kill_sb = kill_anon_super, .fs_flags = FS_BINARY_MOUNTDATA, }; +MODULE_ALIAS_FS("ncpfs"); static int __init init_ncp_fs(void) { diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index 6958adfaff08..60426ccb3b65 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -45,7 +45,7 @@ ncp_get_fs_info(struct ncp_server * server, struct inode *inode, return -EINVAL; } /* TODO: info.addr = server->m.serv_addr; */ - SET_UID(info.mounted_uid, server->m.mounted_uid); + SET_UID(info.mounted_uid, from_kuid_munged(current_user_ns(), server->m.mounted_uid)); info.connection = server->connection; info.buffer_size = server->buffer_size; info.volume_number = NCP_FINFO(inode)->volNumber; @@ -69,7 +69,7 @@ ncp_get_fs_info_v2(struct ncp_server * server, struct inode *inode, DPRINTK("info.version invalid: %d\n", info2.version); return -EINVAL; } - info2.mounted_uid = server->m.mounted_uid; + info2.mounted_uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid); info2.connection = server->connection; info2.buffer_size = server->buffer_size; info2.volume_number = NCP_FINFO(inode)->volNumber; @@ -135,7 +135,7 @@ ncp_get_compat_fs_info_v2(struct ncp_server * server, struct inode *inode, DPRINTK("info.version invalid: %d\n", info2.version); return -EINVAL; } - info2.mounted_uid = server->m.mounted_uid; + info2.mounted_uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid); info2.connection = server->connection; info2.buffer_size = server->buffer_size; info2.volume_number = NCP_FINFO(inode)->volNumber; @@ -348,22 +348,25 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg { u16 uid; - SET_UID(uid, server->m.mounted_uid); + SET_UID(uid, from_kuid_munged(current_user_ns(), server->m.mounted_uid)); if (put_user(uid, (u16 __user *)argp)) return -EFAULT; return 0; } case NCP_IOC_GETMOUNTUID32: - if (put_user(server->m.mounted_uid, - (u32 __user *)argp)) + { + uid_t uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid); + if (put_user(uid, (u32 __user *)argp)) return -EFAULT; return 0; + } case NCP_IOC_GETMOUNTUID64: - if (put_user(server->m.mounted_uid, - (u64 __user *)argp)) + { + uid_t uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid); + if (put_user(uid, (u64 __user *)argp)) return -EFAULT; return 0; - + } case NCP_IOC_GETROOT: { struct ncp_setroot_ioctl sr; @@ -808,9 +811,9 @@ outrel: long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = file_inode(filp); struct ncp_server *server = NCP_SERVER(inode); - uid_t uid = current_uid(); + kuid_t uid = current_uid(); int need_drop_write = 0; long ret; @@ -819,12 +822,12 @@ long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case NCP_IOC_CONN_LOGGED_IN: case NCP_IOC_SETROOT: if (!capable(CAP_SYS_ADMIN)) { - ret = -EACCES; + ret = -EPERM; goto out; } break; } - if (server->m.mounted_uid != uid) { + if (!uid_eq(server->m.mounted_uid, uid)) { switch (cmd) { /* * Only mount owner can issue these ioctls. Information diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index 63d14a99483d..ee24df5af1f9 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -105,7 +105,7 @@ static const struct vm_operations_struct ncp_file_mmap = /* This is used for a general mmap of a ncp file */ int ncp_mmap(struct file *file, struct vm_area_struct *vma) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); DPRINTK("ncp_mmap: called\n"); diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h index 54cc0cdb3dcb..c51b2c543539 100644 --- a/fs/ncpfs/ncp_fs_sb.h +++ b/fs/ncpfs/ncp_fs_sb.h @@ -23,15 +23,15 @@ struct ncp_mount_data_kernel { unsigned long flags; /* NCP_MOUNT_* flags */ unsigned int int_flags; /* internal flags */ #define NCP_IMOUNT_LOGGEDIN_POSSIBLE 0x0001 - uid_t mounted_uid; /* Who may umount() this filesystem? */ + kuid_t mounted_uid; /* Who may umount() this filesystem? */ struct pid *wdog_pid; /* Who cares for our watchdog packets? */ unsigned int ncp_fd; /* The socket to the ncp port */ unsigned int time_out; /* How long should I wait after sending a NCP request? */ unsigned int retry_count; /* And how often should I retry? */ unsigned char mounted_vol[NCP_VOLNAME_LEN + 1]; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; umode_t file_mode; umode_t dir_mode; int info_fd; diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 4fa788c93f46..434b93ec0970 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -1273,6 +1273,7 @@ static const struct nfs_pageio_ops bl_pg_write_ops = { static struct pnfs_layoutdriver_type blocklayout_type = { .id = LAYOUT_BLOCK_VOLUME, .name = "LAYOUT_BLOCK_VOLUME", + .owner = THIS_MODULE, .read_pagelist = bl_read_pagelist, .write_pagelist = bl_write_pagelist, .alloc_layout_hdr = bl_alloc_layout_hdr, diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c index 737d839bc17b..6fc7b5cae92b 100644 --- a/fs/nfs/blocklayout/blocklayoutdm.c +++ b/fs/nfs/blocklayout/blocklayoutdm.c @@ -55,7 +55,8 @@ static void dev_remove(struct net *net, dev_t dev) bl_pipe_msg.bl_wq = &nn->bl_wq; memset(msg, 0, sizeof(*msg)); - msg->data = kzalloc(1 + sizeof(bl_umount_request), GFP_NOFS); + msg->len = sizeof(bl_msg) + bl_msg.totallen; + msg->data = kzalloc(msg->len, GFP_NOFS); if (!msg->data) goto out; @@ -66,7 +67,6 @@ static void dev_remove(struct net *net, dev_t dev) memcpy(msg->data, &bl_msg, sizeof(bl_msg)); dataptr = (uint8_t *) msg->data; memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request)); - msg->len = sizeof(bl_msg) + bl_msg.totallen; add_wait_queue(&nn->bl_wq, &wq); if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) { diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c index 862a2f16db64..5f7b053720ee 100644 --- a/fs/nfs/cache_lib.c +++ b/fs/nfs/cache_lib.c @@ -128,10 +128,13 @@ int nfs_cache_register_net(struct net *net, struct cache_detail *cd) struct super_block *pipefs_sb; int ret = 0; + sunrpc_init_cache_detail(cd); pipefs_sb = rpc_get_sb_net(net); if (pipefs_sb) { ret = nfs_cache_register_sb(pipefs_sb, cd); rpc_put_sb_net(net); + if (ret) + sunrpc_destroy_cache_detail(cd); } return ret; } @@ -151,14 +154,5 @@ void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd) nfs_cache_unregister_sb(pipefs_sb, cd); rpc_put_sb_net(net); } -} - -void nfs_cache_init(struct cache_detail *cd) -{ - sunrpc_init_cache_detail(cd); -} - -void nfs_cache_destroy(struct cache_detail *cd) -{ sunrpc_destroy_cache_detail(cd); } diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h index 317db95e37f8..4116d2c3f52f 100644 --- a/fs/nfs/cache_lib.h +++ b/fs/nfs/cache_lib.h @@ -23,8 +23,6 @@ extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void); extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq); extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq); -extern void nfs_cache_init(struct cache_detail *cd); -extern void nfs_cache_destroy(struct cache_detail *cd); extern int nfs_cache_register_net(struct net *net, struct cache_detail *cd); extern void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd); extern int nfs_cache_register_sb(struct super_block *sb, diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 264d1aa935f2..2960512792c2 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -183,60 +183,15 @@ static u32 initiate_file_draining(struct nfs_client *clp, static u32 initiate_bulk_draining(struct nfs_client *clp, struct cb_layoutrecallargs *args) { - struct nfs_server *server; - struct pnfs_layout_hdr *lo; - struct inode *ino; - u32 rv = NFS4ERR_NOMATCHING_LAYOUT; - struct pnfs_layout_hdr *tmp; - LIST_HEAD(recall_list); - LIST_HEAD(free_me_list); - struct pnfs_layout_range range = { - .iomode = IOMODE_ANY, - .offset = 0, - .length = NFS4_MAX_UINT64, - }; - - spin_lock(&clp->cl_lock); - rcu_read_lock(); - list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - if ((args->cbl_recall_type == RETURN_FSID) && - memcmp(&server->fsid, &args->cbl_fsid, - sizeof(struct nfs_fsid))) - continue; + int stat; - list_for_each_entry(lo, &server->layouts, plh_layouts) { - ino = igrab(lo->plh_inode); - if (!ino) - continue; - spin_lock(&ino->i_lock); - /* Is this layout in the process of being freed? */ - if (NFS_I(ino)->layout != lo) { - spin_unlock(&ino->i_lock); - iput(ino); - continue; - } - pnfs_get_layout_hdr(lo); - spin_unlock(&ino->i_lock); - list_add(&lo->plh_bulk_recall, &recall_list); - } - } - rcu_read_unlock(); - spin_unlock(&clp->cl_lock); - - list_for_each_entry_safe(lo, tmp, - &recall_list, plh_bulk_recall) { - ino = lo->plh_inode; - spin_lock(&ino->i_lock); - set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); - if (pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, &range)) - rv = NFS4ERR_DELAY; - list_del_init(&lo->plh_bulk_recall); - spin_unlock(&ino->i_lock); - pnfs_free_lseg_list(&free_me_list); - pnfs_put_layout_hdr(lo); - iput(ino); - } - return rv; + if (args->cbl_recall_type == RETURN_FSID) + stat = pnfs_destroy_layouts_byfsid(clp, &args->cbl_fsid, true); + else + stat = pnfs_destroy_layouts_byclid(clp, true); + if (stat != 0) + return NFS4ERR_DELAY; + return NFS4ERR_NOMATCHING_LAYOUT; } static u32 do_callback_layoutrecall(struct nfs_client *clp, diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 9f3c66438d0e..84d8eae203a7 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -197,7 +197,6 @@ error_0: EXPORT_SYMBOL_GPL(nfs_alloc_client); #if IS_ENABLED(CONFIG_NFS_V4) -/* idr_remove_all is not needed as all id's are removed by nfs_put_client */ void nfs_cleanup_cb_ident_idr(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 81c5eec3cf38..6390a4b5fee7 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -55,7 +55,8 @@ int nfs4_have_delegation(struct inode *inode, fmode_t flags) flags &= FMODE_READ|FMODE_WRITE; rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); - if (delegation != NULL && (delegation->type & flags) == flags) { + if (delegation != NULL && (delegation->type & flags) == flags && + !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { nfs_mark_delegation_referenced(delegation); ret = 1; } @@ -70,8 +71,10 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_ int status = 0; if (inode->i_flock == NULL) - goto out; + return 0; + if (inode->i_flock == NULL) + goto out; /* Protect inode->i_flock using the file locks lock */ lock_flocks(); for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { @@ -94,7 +97,9 @@ static int nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *s { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_context *ctx; + struct nfs4_state_owner *sp; struct nfs4_state *state; + unsigned int seq; int err; again: @@ -109,9 +114,16 @@ again: continue; get_nfs_open_context(ctx); spin_unlock(&inode->i_lock); + sp = state->owner; + /* Block nfs4_proc_unlck */ + mutex_lock(&sp->so_delegreturn_mutex); + seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); err = nfs4_open_delegation_recall(ctx, state, stateid); - if (err >= 0) + if (!err) err = nfs_delegation_claim_locks(ctx, state); + if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) + err = -EAGAIN; + mutex_unlock(&sp->so_delegreturn_mutex); put_nfs_open_context(ctx); if (err != 0) return err; @@ -182,39 +194,91 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation } static struct nfs_delegation * +nfs_start_delegation_return_locked(struct nfs_inode *nfsi) +{ + struct nfs_delegation *ret = NULL; + struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation); + + if (delegation == NULL) + goto out; + spin_lock(&delegation->lock); + if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) + ret = delegation; + spin_unlock(&delegation->lock); +out: + return ret; +} + +static struct nfs_delegation * +nfs_start_delegation_return(struct nfs_inode *nfsi) +{ + struct nfs_delegation *delegation; + + rcu_read_lock(); + delegation = nfs_start_delegation_return_locked(nfsi); + rcu_read_unlock(); + return delegation; +} + +static void +nfs_abort_delegation_return(struct nfs_delegation *delegation, + struct nfs_client *clp) +{ + + spin_lock(&delegation->lock); + clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags); + set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + spin_unlock(&delegation->lock); + set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); +} + +static struct nfs_delegation * nfs_detach_delegation_locked(struct nfs_inode *nfsi, - struct nfs_server *server) + struct nfs_delegation *delegation, + struct nfs_client *clp) { - struct nfs_delegation *delegation = + struct nfs_delegation *deleg_cur = rcu_dereference_protected(nfsi->delegation, - lockdep_is_held(&server->nfs_client->cl_lock)); + lockdep_is_held(&clp->cl_lock)); - if (delegation == NULL) - goto nomatch; + if (deleg_cur == NULL || delegation != deleg_cur) + return NULL; spin_lock(&delegation->lock); + set_bit(NFS_DELEGATION_RETURNING, &delegation->flags); list_del_rcu(&delegation->super_list); delegation->inode = NULL; nfsi->delegation_state = 0; rcu_assign_pointer(nfsi->delegation, NULL); spin_unlock(&delegation->lock); return delegation; -nomatch: - return NULL; } static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi, - struct nfs_server *server) + struct nfs_delegation *delegation, + struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; - struct nfs_delegation *delegation; spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(nfsi, server); + delegation = nfs_detach_delegation_locked(nfsi, delegation, clp); spin_unlock(&clp->cl_lock); return delegation; } +static struct nfs_delegation * +nfs_inode_detach_delegation(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_delegation *delegation; + + delegation = nfs_start_delegation_return(nfsi); + if (delegation == NULL) + return NULL; + return nfs_detach_delegation(nfsi, delegation, server); +} + /** * nfs_inode_set_delegation - set up a delegation on an inode * @inode: inode to which delegation applies @@ -268,7 +332,10 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct delegation = NULL; goto out; } - freeme = nfs_detach_delegation_locked(nfsi, server); + freeme = nfs_detach_delegation_locked(nfsi, + old_delegation, clp); + if (freeme == NULL) + goto out; } list_add_rcu(&delegation->super_list, &server->delegations); nfsi->delegation_state = delegation->type; @@ -292,19 +359,29 @@ out: /* * Basic procedure for returning a delegation to the server */ -static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) +static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation *delegation, int issync) { + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; struct nfs_inode *nfsi = NFS_I(inode); int err; - /* - * Guard against new delegated open/lock/unlock calls and against - * state recovery - */ - down_write(&nfsi->rwsem); - err = nfs_delegation_claim_opens(inode, &delegation->stateid); - up_write(&nfsi->rwsem); - if (err) + if (delegation == NULL) + return 0; + do { + err = nfs_delegation_claim_opens(inode, &delegation->stateid); + if (!issync || err != -EAGAIN) + break; + /* + * Guard against state recovery + */ + err = nfs4_wait_clnt_recover(clp); + } while (err == 0); + + if (err) { + nfs_abort_delegation_return(delegation, clp); + goto out; + } + if (!nfs_detach_delegation(nfsi, delegation, NFS_SERVER(inode))) goto out; err = nfs_do_return_delegation(inode, delegation, issync); @@ -340,13 +417,10 @@ restart: inode = nfs_delegation_grab_inode(delegation); if (inode == NULL) continue; - delegation = nfs_detach_delegation(NFS_I(inode), - server); + delegation = nfs_start_delegation_return_locked(NFS_I(inode)); rcu_read_unlock(); - if (delegation != NULL) - err = __nfs_inode_return_delegation(inode, - delegation, 0); + err = nfs_end_delegation_return(inode, delegation, 0); iput(inode); if (!err) goto restart; @@ -367,15 +441,11 @@ restart: */ void nfs_inode_return_delegation_noreclaim(struct inode *inode) { - struct nfs_server *server = NFS_SERVER(inode); - struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; - if (rcu_access_pointer(nfsi->delegation) != NULL) { - delegation = nfs_detach_delegation(nfsi, server); - if (delegation != NULL) - nfs_do_return_delegation(inode, delegation, 0); - } + delegation = nfs_inode_detach_delegation(inode); + if (delegation != NULL) + nfs_do_return_delegation(inode, delegation, 0); } /** @@ -390,18 +460,14 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode) */ int nfs4_inode_return_delegation(struct inode *inode) { - struct nfs_server *server = NFS_SERVER(inode); struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; int err = 0; nfs_wb_all(inode); - if (rcu_access_pointer(nfsi->delegation) != NULL) { - delegation = nfs_detach_delegation(nfsi, server); - if (delegation != NULL) { - err = __nfs_inode_return_delegation(inode, delegation, 1); - } - } + delegation = nfs_start_delegation_return(nfsi); + if (delegation != NULL) + err = nfs_end_delegation_return(inode, delegation, 1); return err; } @@ -471,7 +537,7 @@ void nfs_remove_bad_delegation(struct inode *inode) { struct nfs_delegation *delegation; - delegation = nfs_detach_delegation(NFS_I(inode), NFS_SERVER(inode)); + delegation = nfs_inode_detach_delegation(inode); if (delegation) { nfs_inode_find_state_and_recover(inode, &delegation->stateid); nfs_free_delegation(delegation); @@ -649,7 +715,7 @@ restart: if (inode == NULL) continue; delegation = nfs_detach_delegation(NFS_I(inode), - server); + delegation, server); rcu_read_unlock(); if (delegation != NULL) diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index bbc6a4dba0d8..d54d4fca6793 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -29,6 +29,7 @@ enum { NFS_DELEGATION_NEED_RECLAIM = 0, NFS_DELEGATION_RETURN, NFS_DELEGATION_REFERENCED, + NFS_DELEGATION_RETURNING, }; int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 1b2d7eb93796..f23f455be42b 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -281,7 +281,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des for (i = 0; i < array->size; i++) { if (array->array[i].cookie == *desc->dir_cookie) { - struct nfs_inode *nfsi = NFS_I(desc->file->f_path.dentry->d_inode); + struct nfs_inode *nfsi = NFS_I(file_inode(desc->file)); struct nfs_open_dir_context *ctx = desc->file->private_data; new_pos = desc->current_index + i; @@ -629,7 +629,7 @@ out: static int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) { - struct inode *inode = desc->file->f_path.dentry->d_inode; + struct inode *inode = file_inode(desc->file); int ret; ret = nfs_readdir_xdr_to_array(desc, page, inode); @@ -660,7 +660,7 @@ void cache_page_release(nfs_readdir_descriptor_t *desc) static struct page *get_cache_page(nfs_readdir_descriptor_t *desc) { - return read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping, + return read_cache_page(file_inode(desc->file)->i_mapping, desc->page_index, (filler_t *)nfs_readdir_filler, desc); } @@ -764,7 +764,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, { struct page *page = NULL; int status; - struct inode *inode = desc->file->f_path.dentry->d_inode; + struct inode *inode = file_inode(desc->file); struct nfs_open_dir_context *ctx = desc->file->private_data; dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", @@ -1136,6 +1136,45 @@ out_error: } /* + * A weaker form of d_revalidate for revalidating just the dentry->d_inode + * when we don't really care about the dentry name. This is called when a + * pathwalk ends on a dentry that was not found via a normal lookup in the + * parent dir (e.g.: ".", "..", procfs symlinks or mountpoint traversals). + * + * In this situation, we just want to verify that the inode itself is OK + * since the dentry might have changed on the server. + */ +static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags) +{ + int error; + struct inode *inode = dentry->d_inode; + + /* + * I believe we can only get a negative dentry here in the case of a + * procfs-style symlink. Just assume it's correct for now, but we may + * eventually need to do something more here. + */ + if (!inode) { + dfprintk(LOOKUPCACHE, "%s: %s/%s has negative inode\n", + __func__, dentry->d_parent->d_name.name, + dentry->d_name.name); + return 1; + } + + if (is_bad_inode(inode)) { + dfprintk(LOOKUPCACHE, "%s: %s/%s has dud inode\n", + __func__, dentry->d_parent->d_name.name, + dentry->d_name.name); + return 0; + } + + error = nfs_revalidate_inode(NFS_SERVER(inode), inode); + dfprintk(LOOKUPCACHE, "NFS: %s: inode %lu is %s\n", + __func__, inode->i_ino, error ? "invalid" : "valid"); + return !error; +} + +/* * This is called from dput() when d_count is going to 0. */ static int nfs_dentry_delete(const struct dentry *dentry) @@ -1202,6 +1241,7 @@ static void nfs_d_release(struct dentry *dentry) const struct dentry_operations nfs_dentry_operations = { .d_revalidate = nfs_lookup_revalidate, + .d_weak_revalidate = nfs_weak_revalidate, .d_delete = nfs_dentry_delete, .d_iput = nfs_dentry_iput, .d_automount = nfs_d_automount, diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index ca4b11ec87a2..945527092295 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> #include <linux/dns_resolver.h> #include "dns_resolve.h" @@ -42,6 +43,7 @@ EXPORT_SYMBOL_GPL(nfs_dns_resolve_name); #include <linux/seq_file.h> #include <linux/inet.h> #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/cache.h> #include <linux/sunrpc/svcauth.h> #include <linux/sunrpc/rpc_pipe_fs.h> @@ -142,7 +144,7 @@ static int nfs_dns_upcall(struct cache_detail *cd, ret = nfs_cache_upcall(cd, key->hostname); if (ret) - ret = sunrpc_cache_pipe_upcall(cd, ch, nfs_dns_request); + ret = sunrpc_cache_pipe_upcall(cd, ch); return ret; } @@ -351,60 +353,47 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, } EXPORT_SYMBOL_GPL(nfs_dns_resolve_name); +static struct cache_detail nfs_dns_resolve_template = { + .owner = THIS_MODULE, + .hash_size = NFS_DNS_HASHTBL_SIZE, + .name = "dns_resolve", + .cache_put = nfs_dns_ent_put, + .cache_upcall = nfs_dns_upcall, + .cache_request = nfs_dns_request, + .cache_parse = nfs_dns_parse, + .cache_show = nfs_dns_show, + .match = nfs_dns_match, + .init = nfs_dns_ent_init, + .update = nfs_dns_ent_update, + .alloc = nfs_dns_ent_alloc, +}; + + int nfs_dns_resolver_cache_init(struct net *net) { - int err = -ENOMEM; + int err; struct nfs_net *nn = net_generic(net, nfs_net_id); - struct cache_detail *cd; - struct cache_head **tbl; - cd = kzalloc(sizeof(struct cache_detail), GFP_KERNEL); - if (cd == NULL) - goto err_cd; - - tbl = kzalloc(NFS_DNS_HASHTBL_SIZE * sizeof(struct cache_head *), - GFP_KERNEL); - if (tbl == NULL) - goto err_tbl; - - cd->owner = THIS_MODULE, - cd->hash_size = NFS_DNS_HASHTBL_SIZE, - cd->hash_table = tbl, - cd->name = "dns_resolve", - cd->cache_put = nfs_dns_ent_put, - cd->cache_upcall = nfs_dns_upcall, - cd->cache_parse = nfs_dns_parse, - cd->cache_show = nfs_dns_show, - cd->match = nfs_dns_match, - cd->init = nfs_dns_ent_init, - cd->update = nfs_dns_ent_update, - cd->alloc = nfs_dns_ent_alloc, - - nfs_cache_init(cd); - err = nfs_cache_register_net(net, cd); + nn->nfs_dns_resolve = cache_create_net(&nfs_dns_resolve_template, net); + if (IS_ERR(nn->nfs_dns_resolve)) + return PTR_ERR(nn->nfs_dns_resolve); + + err = nfs_cache_register_net(net, nn->nfs_dns_resolve); if (err) goto err_reg; - nn->nfs_dns_resolve = cd; return 0; err_reg: - nfs_cache_destroy(cd); - kfree(cd->hash_table); -err_tbl: - kfree(cd); -err_cd: + cache_destroy_net(nn->nfs_dns_resolve, net); return err; } void nfs_dns_resolver_cache_destroy(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); - struct cache_detail *cd = nn->nfs_dns_resolve; - nfs_cache_unregister_net(net, cd); - nfs_cache_destroy(cd); - kfree(cd->hash_table); - kfree(cd); + nfs_cache_unregister_net(net, nn->nfs_dns_resolve); + cache_destroy_net(nn->nfs_dns_resolve, net); } static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 3c2b893665ba..29f4a48a0ee6 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -292,7 +292,7 @@ static int nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) { int ret; - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); do { ret = filemap_write_and_wait_range(inode->i_mapping, start, end); diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 033803c36644..44efaa8c5f78 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -126,8 +126,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh, } spin_unlock(&ret->d_lock); out: - if (name) - kfree(name); + kfree(name); nfs_free_fattr(fsinfo.fattr); return ret; } diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index bc3968fa81e5..c516da5873fd 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -97,7 +97,7 @@ static void nfs_fattr_free_group_name(struct nfs_fattr *fattr) static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr *fattr) { struct nfs4_string *owner = fattr->owner_name; - __u32 uid; + kuid_t uid; if (!(fattr->valid & NFS_ATTR_FATTR_OWNER_NAME)) return false; @@ -111,7 +111,7 @@ static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr static bool nfs_fattr_map_group_name(struct nfs_server *server, struct nfs_fattr *fattr) { struct nfs4_string *group = fattr->group_name; - __u32 gid; + kgid_t gid; if (!(fattr->valid & NFS_ATTR_FATTR_GROUP_NAME)) return false; @@ -193,7 +193,8 @@ static int nfs_idmap_init_keyring(void) if (!cred) return -ENOMEM; - keyring = keyring_alloc(".id_resolver", 0, 0, cred, + keyring = keyring_alloc(".id_resolver", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, KEY_ALLOC_NOT_IN_QUOTA, NULL); @@ -725,9 +726,9 @@ out1: return ret; } -static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data) +static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data, size_t datalen) { - return key_instantiate_and_link(key, data, strlen(data) + 1, + return key_instantiate_and_link(key, data, datalen, id_resolver_cache->thread_keyring, authkey); } @@ -737,6 +738,7 @@ static int nfs_idmap_read_and_verify_message(struct idmap_msg *im, struct key *key, struct key *authkey) { char id_str[NFS_UINT_MAXLEN]; + size_t len; int ret = -ENOKEY; /* ret = -ENOKEY */ @@ -746,13 +748,15 @@ static int nfs_idmap_read_and_verify_message(struct idmap_msg *im, case IDMAP_CONV_NAMETOID: if (strcmp(upcall->im_name, im->im_name) != 0) break; - sprintf(id_str, "%d", im->im_id); - ret = nfs_idmap_instantiate(key, authkey, id_str); + /* Note: here we store the NUL terminator too */ + len = sprintf(id_str, "%d", im->im_id) + 1; + ret = nfs_idmap_instantiate(key, authkey, id_str, len); break; case IDMAP_CONV_IDTONAME: if (upcall->im_id != im->im_id) break; - ret = nfs_idmap_instantiate(key, authkey, im->im_name); + len = strlen(im->im_name); + ret = nfs_idmap_instantiate(key, authkey, im->im_name, len); break; default: ret = -EINVAL; @@ -764,7 +768,7 @@ out: static ssize_t idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { - struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode); + struct rpc_inode *rpci = RPC_I(file_inode(filp)); struct idmap *idmap = (struct idmap *)rpci->private; struct key_construction *cons; struct idmap_msg im; @@ -836,43 +840,61 @@ idmap_release_pipe(struct inode *inode) nfs_idmap_abort_pipe_upcall(idmap, -EPIPE); } -int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) +int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, kuid_t *uid) { struct idmap *idmap = server->nfs_client->cl_idmap; + __u32 id = -1; + int ret = 0; - if (nfs_map_string_to_numeric(name, namelen, uid)) - return 0; - return nfs_idmap_lookup_id(name, namelen, "uid", uid, idmap); + if (!nfs_map_string_to_numeric(name, namelen, &id)) + ret = nfs_idmap_lookup_id(name, namelen, "uid", &id, idmap); + if (ret == 0) { + *uid = make_kuid(&init_user_ns, id); + if (!uid_valid(*uid)) + ret = -ERANGE; + } + return ret; } -int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid) +int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, kgid_t *gid) { struct idmap *idmap = server->nfs_client->cl_idmap; + __u32 id = -1; + int ret = 0; - if (nfs_map_string_to_numeric(name, namelen, gid)) - return 0; - return nfs_idmap_lookup_id(name, namelen, "gid", gid, idmap); + if (!nfs_map_string_to_numeric(name, namelen, &id)) + ret = nfs_idmap_lookup_id(name, namelen, "gid", &id, idmap); + if (ret == 0) { + *gid = make_kgid(&init_user_ns, id); + if (!gid_valid(*gid)) + ret = -ERANGE; + } + return ret; } -int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) +int nfs_map_uid_to_name(const struct nfs_server *server, kuid_t uid, char *buf, size_t buflen) { struct idmap *idmap = server->nfs_client->cl_idmap; int ret = -EINVAL; + __u32 id; + id = from_kuid(&init_user_ns, uid); if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) - ret = nfs_idmap_lookup_name(uid, "user", buf, buflen, idmap); + ret = nfs_idmap_lookup_name(id, "user", buf, buflen, idmap); if (ret < 0) - ret = nfs_map_numeric_to_string(uid, buf, buflen); + ret = nfs_map_numeric_to_string(id, buf, buflen); return ret; } -int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen) +int nfs_map_gid_to_group(const struct nfs_server *server, kgid_t gid, char *buf, size_t buflen) { struct idmap *idmap = server->nfs_client->cl_idmap; int ret = -EINVAL; + __u32 id; + id = from_kgid(&init_user_ns, gid); if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) - ret = nfs_idmap_lookup_name(gid, "group", buf, buflen, idmap); + ret = nfs_idmap_lookup_name(id, "group", buf, buflen, idmap); if (ret < 0) - ret = nfs_map_numeric_to_string(gid, buf, buflen); + ret = nfs_map_numeric_to_string(id, buf, buflen); return ret; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index ebeb94ce1b0b..1f941674b089 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -237,6 +237,8 @@ nfs_find_actor(struct inode *inode, void *opaque) if (NFS_FILEID(inode) != fattr->fileid) return 0; + if ((S_IFMT & inode->i_mode) != (S_IFMT & fattr->mode)) + return 0; if (nfs_compare_fh(NFS_FH(inode), fh)) return 0; if (is_bad_inode(inode) || NFS_STALE(inode)) @@ -332,8 +334,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) inode->i_version = 0; inode->i_size = 0; clear_nlink(inode); - inode->i_uid = -2; - inode->i_gid = -2; + inode->i_uid = make_kuid(&init_user_ns, -2); + inode->i_gid = make_kgid(&init_user_ns, -2); inode->i_blocks = 0; memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); nfsi->write_io = 0; @@ -694,10 +696,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) if (ctx->cred != NULL) put_rpccred(ctx->cred); dput(ctx->dentry); - if (is_sync) - nfs_sb_deactive(sb); - else - nfs_sb_deactive_async(sb); + nfs_sb_deactive(sb); kfree(ctx->mdsthreshold); kfree(ctx); } @@ -714,7 +713,7 @@ EXPORT_SYMBOL_GPL(put_nfs_open_context); */ void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct nfs_inode *nfsi = NFS_I(inode); filp->private_data = get_nfs_open_context(ctx); @@ -747,7 +746,7 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c static void nfs_file_clear_open_context(struct file *filp) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct nfs_open_context *ctx = nfs_file_open_context(filp); if (ctx) { @@ -1009,9 +1008,9 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat /* Have any file permissions changed? */ if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; - if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && inode->i_uid != fattr->uid) + if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && !uid_eq(inode->i_uid, fattr->uid)) invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; - if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && inode->i_gid != fattr->gid) + if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && !gid_eq(inode->i_gid, fattr->gid)) invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; /* Has the link count changed? */ @@ -1440,7 +1439,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_OWNER) { - if (inode->i_uid != fattr->uid) { + if (!uid_eq(inode->i_uid, fattr->uid)) { invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; inode->i_uid = fattr->uid; } @@ -1451,7 +1450,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_GROUP) { - if (inode->i_gid != fattr->gid) { + if (!gid_eq(inode->i_gid, fattr->gid)) { invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; inode->i_gid = fattr->gid; } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index f0e6c7df1a07..541c9ebdbc5a 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -329,7 +329,6 @@ extern int __init register_nfs_fs(void); extern void __exit unregister_nfs_fs(void); extern void nfs_sb_active(struct super_block *sb); extern void nfs_sb_deactive(struct super_block *sb); -extern void nfs_sb_deactive_async(struct super_block *sb); /* namespace.c */ #define NFS_PATH_CANONICAL 1 diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index dd057bc6b65b..fc8dc20fdeb9 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -177,11 +177,31 @@ out_nofree: return mnt; } +static int +nfs_namespace_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + if (NFS_FH(dentry->d_inode)->size != 0) + return nfs_getattr(mnt, dentry, stat); + generic_fillattr(dentry->d_inode, stat); + return 0; +} + +static int +nfs_namespace_setattr(struct dentry *dentry, struct iattr *attr) +{ + if (NFS_FH(dentry->d_inode)->size != 0) + return nfs_setattr(dentry, attr); + return -EACCES; +} + const struct inode_operations nfs_mountpoint_inode_operations = { .getattr = nfs_getattr, + .setattr = nfs_setattr, }; const struct inode_operations nfs_referral_inode_operations = { + .getattr = nfs_namespace_getattr, + .setattr = nfs_namespace_setattr, }; static void nfs_expire_automounts(struct work_struct *work) diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 06b9df49f7f7..62db136339ea 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -290,8 +290,13 @@ static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr) fattr->mode = be32_to_cpup(p++); fattr->nlink = be32_to_cpup(p++); - fattr->uid = be32_to_cpup(p++); - fattr->gid = be32_to_cpup(p++); + fattr->uid = make_kuid(&init_user_ns, be32_to_cpup(p++)); + if (!uid_valid(fattr->uid)) + goto out_uid; + fattr->gid = make_kgid(&init_user_ns, be32_to_cpup(p++)); + if (!gid_valid(fattr->gid)) + goto out_gid; + fattr->size = be32_to_cpup(p++); fattr->du.nfs2.blocksize = be32_to_cpup(p++); @@ -313,6 +318,12 @@ static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr) fattr->change_attr = nfs_timespec_to_change_attr(&fattr->ctime); return 0; +out_uid: + dprintk("NFS: returned invalid uid\n"); + return -EINVAL; +out_gid: + dprintk("NFS: returned invalid gid\n"); + return -EINVAL; out_overflow: print_overflow_msg(__func__, xdr); return -EIO; @@ -351,11 +362,11 @@ static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr) else *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); if (attr->ia_valid & ATTR_UID) - *p++ = cpu_to_be32(attr->ia_uid); + *p++ = cpu_to_be32(from_kuid(&init_user_ns, attr->ia_uid)); else *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); if (attr->ia_valid & ATTR_GID) - *p++ = cpu_to_be32(attr->ia_gid); + *p++ = cpu_to_be32(from_kgid(&init_user_ns, attr->ia_gid)); else *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); if (attr->ia_valid & ATTR_SIZE) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 70efb63b1e42..43ea96ced28c 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -872,7 +872,7 @@ static void nfs3_proc_commit_setup(struct nfs_commit_data *data, struct rpc_mess static int nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); } diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index bffc32406fbf..fa6d72131c19 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -592,13 +592,13 @@ static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr) if (attr->ia_valid & ATTR_UID) { *p++ = xdr_one; - *p++ = cpu_to_be32(attr->ia_uid); + *p++ = cpu_to_be32(from_kuid(&init_user_ns, attr->ia_uid)); } else *p++ = xdr_zero; if (attr->ia_valid & ATTR_GID) { *p++ = xdr_one; - *p++ = cpu_to_be32(attr->ia_gid); + *p++ = cpu_to_be32(from_kgid(&init_user_ns, attr->ia_gid)); } else *p++ = xdr_zero; @@ -657,8 +657,12 @@ static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr) fattr->mode = (be32_to_cpup(p++) & ~S_IFMT) | fmode; fattr->nlink = be32_to_cpup(p++); - fattr->uid = be32_to_cpup(p++); - fattr->gid = be32_to_cpup(p++); + fattr->uid = make_kuid(&init_user_ns, be32_to_cpup(p++)); + if (!uid_valid(fattr->uid)) + goto out_uid; + fattr->gid = make_kgid(&init_user_ns, be32_to_cpup(p++)); + if (!gid_valid(fattr->gid)) + goto out_gid; p = xdr_decode_size3(p, &fattr->size); p = xdr_decode_size3(p, &fattr->du.nfs3.used); @@ -675,6 +679,12 @@ static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr) fattr->valid |= NFS_ATTR_FATTR_V3; return 0; +out_uid: + dprintk("NFS: returned invalid uid\n"); + return -EINVAL; +out_gid: + dprintk("NFS: returned invalid gid\n"); + return -EINVAL; out_overflow: print_overflow_msg(__func__, xdr); return -EIO; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index a3f488b074a2..944c9a5c1039 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -13,6 +13,8 @@ #define NFS4_MAX_LOOP_ON_RECOVER (10) +#include <linux/seqlock.h> + struct idmap; enum nfs4_client_state { @@ -90,6 +92,8 @@ struct nfs4_state_owner { unsigned long so_flags; struct list_head so_states; struct nfs_seqid_counter so_seqid; + seqcount_t so_reclaim_seqcount; + struct mutex so_delegreturn_mutex; }; enum { diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index acc347268124..ac4fc9a8fdbc 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -6,6 +6,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_idmap.h> #include <linux/nfs_mount.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/auth.h> #include <linux/sunrpc/xprt.h> #include <linux/sunrpc/bc_xprt.h> @@ -29,15 +30,14 @@ static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion) if (clp->rpc_ops->version != 4 || minorversion != 0) return ret; -retry: - if (!idr_pre_get(&nn->cb_ident_idr, GFP_KERNEL)) - return -ENOMEM; + idr_preload(GFP_KERNEL); spin_lock(&nn->nfs_client_lock); - ret = idr_get_new(&nn->cb_ident_idr, clp, &clp->cl_cb_ident); + ret = idr_alloc(&nn->cb_ident_idr, clp, 0, 0, GFP_NOWAIT); + if (ret >= 0) + clp->cl_cb_ident = ret; spin_unlock(&nn->nfs_client_lock); - if (ret == -EAGAIN) - goto retry; - return ret; + idr_preload_end(); + return ret < 0 ? ret : 0; } #ifdef CONFIG_NFS_V4_1 @@ -236,11 +236,10 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, error = nfs4_discover_server_trunking(clp, &old); if (error < 0) goto error; + nfs_put_client(clp); if (clp != old) { clp->cl_preserve_clid = true; - nfs_put_client(clp); clp = old; - atomic_inc(&clp->cl_count); } return clp; @@ -306,7 +305,7 @@ int nfs40_walk_client_list(struct nfs_client *new, .clientid = new->cl_clientid, .confirm = new->cl_confirm, }; - int status; + int status = -NFS4ERR_STALE_CLIENTID; spin_lock(&nn->nfs_client_lock); list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) { @@ -332,40 +331,33 @@ int nfs40_walk_client_list(struct nfs_client *new, if (prev) nfs_put_client(prev); + prev = pos; status = nfs4_proc_setclientid_confirm(pos, &clid, cred); - if (status == 0) { + switch (status) { + case -NFS4ERR_STALE_CLIENTID: + break; + case 0: nfs4_swap_callback_idents(pos, new); - nfs_put_client(pos); + prev = NULL; *result = pos; dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", __func__, pos, atomic_read(&pos->cl_count)); - return 0; - } - if (status != -NFS4ERR_STALE_CLIENTID) { - nfs_put_client(pos); - dprintk("NFS: <-- %s status = %d, no result\n", - __func__, status); - return status; + default: + goto out; } spin_lock(&nn->nfs_client_lock); - prev = pos; } + spin_unlock(&nn->nfs_client_lock); - /* - * No matching nfs_client found. This should be impossible, - * because the new nfs_client has already been added to - * nfs_client_list by nfs_get_client(). - * - * Don't BUG(), since the caller is holding a mutex. - */ + /* No match found. The server lost our clientid */ +out: if (prev) nfs_put_client(prev); - spin_unlock(&nn->nfs_client_lock); - pr_err("NFS: %s Error: no matching nfs_client found\n", __func__); - return -NFS4ERR_STALE_CLIENTID; + dprintk("NFS: <-- %s status = %d\n", __func__, status); + return status; } #ifdef CONFIG_NFS_V4_1 @@ -432,7 +424,7 @@ int nfs41_walk_client_list(struct nfs_client *new, { struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id); struct nfs_client *pos, *n, *prev = NULL; - int error; + int status = -NFS4ERR_STALE_CLIENTID; spin_lock(&nn->nfs_client_lock); list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) { @@ -448,14 +440,17 @@ int nfs41_walk_client_list(struct nfs_client *new, nfs_put_client(prev); prev = pos; - error = nfs_wait_client_init_complete(pos); - if (error < 0) { + nfs4_schedule_lease_recovery(pos); + status = nfs_wait_client_init_complete(pos); + if (status < 0) { nfs_put_client(pos); spin_lock(&nn->nfs_client_lock); continue; } - + status = pos->cl_cons_state; spin_lock(&nn->nfs_client_lock); + if (status < 0) + continue; } if (pos->rpc_ops != new->rpc_ops) @@ -473,6 +468,7 @@ int nfs41_walk_client_list(struct nfs_client *new, if (!nfs4_match_serverowners(pos, new)) continue; + atomic_inc(&pos->cl_count); spin_unlock(&nn->nfs_client_lock); dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", __func__, pos, atomic_read(&pos->cl_count)); @@ -481,16 +477,10 @@ int nfs41_walk_client_list(struct nfs_client *new, return 0; } - /* - * No matching nfs_client found. This should be impossible, - * because the new nfs_client has already been added to - * nfs_client_list by nfs_get_client(). - * - * Don't BUG(), since the caller is holding a mutex. - */ + /* No matching nfs_client found. */ spin_unlock(&nn->nfs_client_lock); - pr_err("NFS: %s Error: no matching nfs_client found\n", __func__); - return -NFS4ERR_STALE_CLIENTID; + dprintk("NFS: <-- %s status = %d\n", __func__, status); + return status; } #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 08ddcccb8887..13e6bb3e3fe5 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -94,7 +94,7 @@ static int nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) { int ret; - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); do { ret = filemap_write_and_wait_range(inode->i_mapping, start, end); diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 194c48410336..4fb234d3aefb 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -99,7 +99,8 @@ static void filelayout_reset_write(struct nfs_write_data *data) task->tk_status = pnfs_write_done_resend_to_mds(hdr->inode, &hdr->pages, - hdr->completion_ops); + hdr->completion_ops, + hdr->dreq); } } @@ -119,7 +120,8 @@ static void filelayout_reset_read(struct nfs_read_data *data) task->tk_status = pnfs_read_done_resend_to_mds(hdr->inode, &hdr->pages, - hdr->completion_ops); + hdr->completion_ops, + hdr->dreq); } } @@ -127,7 +129,6 @@ static void filelayout_fenceme(struct inode *inode, struct pnfs_layout_hdr *lo) { if (!test_and_clear_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) return; - clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags); pnfs_return_layout(inode); } diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index 8c07241fe52b..b8da95548d3d 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -36,7 +36,7 @@ * Default data server connection timeout and retrans vaules. * Set by module paramters dataserver_timeo and dataserver_retrans. */ -#define NFS4_DEF_DS_TIMEO 60 +#define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */ #define NFS4_DEF_DS_RETRANS 5 /* diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index b720064bcd7f..1fe284f01f8b 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -31,6 +31,7 @@ #include <linux/nfs_fs.h> #include <linux/vmalloc.h> #include <linux/module.h> +#include <linux/sunrpc/addr.h> #include "internal.h" #include "nfs4session.h" diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 1e09eb78543b..0dd766079e1c 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -14,6 +14,7 @@ #include <linux/slab.h> #include <linux/string.h> #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> #include <linux/vfs.h> #include <linux/inet.h> #include "internal.h" diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index cf747ef86650..26431cf62ddb 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -93,6 +93,8 @@ static int nfs4_map_errors(int err) return err; switch (err) { case -NFS4ERR_RESOURCE: + case -NFS4ERR_LAYOUTTRYLATER: + case -NFS4ERR_RECALLCONFLICT: return -EREMOTEIO; case -NFS4ERR_WRONGSEC: return -EPERM; @@ -896,6 +898,8 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode) return 0; if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags)) return 0; + if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) + return 0; nfs_mark_delegation_referenced(delegation); return 1; } @@ -973,6 +977,7 @@ static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stat spin_lock(&deleg_cur->lock); if (nfsi->delegation != deleg_cur || + test_bit(NFS_DELEGATION_RETURNING, &deleg_cur->flags) || (deleg_cur->type & fmode) != fmode) goto no_delegation_unlock; @@ -1155,6 +1160,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) data->o_arg.fmode); iput(inode); out: + nfs_release_seqid(data->o_arg.seqid); return state; err_put_inode: iput(inode); @@ -1352,19 +1358,18 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state case -NFS4ERR_BAD_HIGH_SLOT: case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_DEADSESSION: + set_bit(NFS_DELEGATED_STATE, &state->flags); nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); + err = -EAGAIN; goto out; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: + set_bit(NFS_DELEGATED_STATE, &state->flags); case -NFS4ERR_EXPIRED: /* Don't recall a delegation if it was lost */ nfs4_schedule_lease_recovery(server->nfs_client); + err = -EAGAIN; goto out; - case -ERESTARTSYS: - /* - * The show must go on: exit, but mark the - * stateid as needing recovery. - */ case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: @@ -1375,6 +1380,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state err = 0; goto out; } + set_bit(NFS_DELEGATED_STATE, &state->flags); err = nfs4_handle_exception(server, err, &exception); } while (exception.retry); out: @@ -1463,7 +1469,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) struct nfs4_state_owner *sp = data->owner; if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0) - return; + goto out_wait; /* * Check if we still need to send an OPEN call, or if we can use * a delegation instead. @@ -1498,6 +1504,7 @@ unlock_no_action: rcu_read_unlock(); out_no_action: task->tk_action = NULL; +out_wait: nfs4_sequence_done(task, &data->o_res.seq_res); } @@ -1845,6 +1852,43 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct sattr->ia_valid |= ATTR_MTIME; } +static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, + fmode_t fmode, + int flags, + struct nfs4_state **res) +{ + struct nfs4_state_owner *sp = opendata->owner; + struct nfs_server *server = sp->so_server; + struct nfs4_state *state; + unsigned int seq; + int ret; + + seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); + + ret = _nfs4_proc_open(opendata); + if (ret != 0) + goto out; + + state = nfs4_opendata_to_nfs4_state(opendata); + ret = PTR_ERR(state); + if (IS_ERR(state)) + goto out; + if (server->caps & NFS_CAP_POSIX_LOCK) + set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); + + ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags); + if (ret != 0) + goto out; + + if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) { + nfs4_schedule_stateid_recovery(server, state); + nfs4_wait_clnt_recover(server->nfs_client); + } + *res = state; +out: + return ret; +} + /* * Returns a referenced nfs4_state */ @@ -1889,18 +1933,7 @@ static int _nfs4_do_open(struct inode *dir, if (dentry->d_inode != NULL) opendata->state = nfs4_get_open_state(dentry->d_inode, sp); - status = _nfs4_proc_open(opendata); - if (status != 0) - goto err_opendata_put; - - state = nfs4_opendata_to_nfs4_state(opendata); - status = PTR_ERR(state); - if (IS_ERR(state)) - goto err_opendata_put; - if (server->caps & NFS_CAP_POSIX_LOCK) - set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); - - status = nfs4_opendata_access(cred, opendata, state, fmode, flags); + status = _nfs4_open_and_get_state(opendata, fmode, flags, &state); if (status != 0) goto err_opendata_put; @@ -2088,7 +2121,7 @@ static void nfs4_free_closedata(void *data) nfs4_put_open_state(calldata->state); nfs_free_seqid(calldata->arg.seqid); nfs4_put_state_owner(sp); - nfs_sb_deactive_async(sb); + nfs_sb_deactive(sb); kfree(calldata); } @@ -2150,7 +2183,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) dprintk("%s: begin!\n", __func__); if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) - return; + goto out_wait; task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; calldata->arg.fmode = FMODE_READ|FMODE_WRITE; @@ -2172,16 +2205,14 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) if (!call_close) { /* Note: exit _without_ calling nfs4_close_done */ - task->tk_action = NULL; - nfs4_sequence_done(task, &calldata->res.seq_res); - goto out; + goto out_no_action; } if (calldata->arg.fmode == 0) { task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; if (calldata->roc && pnfs_roc_drain(inode, &calldata->roc_barrier, task)) - goto out; + goto out_wait; } nfs_fattr_init(calldata->res.fattr); @@ -2191,8 +2222,12 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) &calldata->res.seq_res, task) != 0) nfs_release_seqid(calldata->arg.seqid); -out: dprintk("%s: done!\n", __func__); + return; +out_no_action: + task->tk_action = NULL; +out_wait: + nfs4_sequence_done(task, &calldata->res.seq_res); } static const struct rpc_call_ops nfs4_close_ops = { @@ -2597,7 +2632,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, int status; if (pnfs_ld_layoutret_on_setattr(inode)) - pnfs_return_layout(inode); + pnfs_commit_and_return_layout(inode); nfs_fattr_init(fattr); @@ -4423,12 +4458,10 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) struct nfs4_unlockdata *calldata = data; if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) - return; + goto out_wait; if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) { /* Note: exit _without_ running nfs4_locku_done */ - task->tk_action = NULL; - nfs4_sequence_done(task, &calldata->res.seq_res); - return; + goto out_no_action; } calldata->timestamp = jiffies; if (nfs4_setup_sequence(calldata->server, @@ -4436,6 +4469,11 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) &calldata->res.seq_res, task) != 0) nfs_release_seqid(calldata->arg.seqid); + return; +out_no_action: + task->tk_action = NULL; +out_wait: + nfs4_sequence_done(task, &calldata->res.seq_res); } static const struct rpc_call_ops nfs4_locku_ops = { @@ -4482,7 +4520,9 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request) { - struct nfs_inode *nfsi = NFS_I(state->inode); + struct inode *inode = state->inode; + struct nfs4_state_owner *sp = state->owner; + struct nfs_inode *nfsi = NFS_I(inode); struct nfs_seqid *seqid; struct nfs4_lock_state *lsp; struct rpc_task *task; @@ -4492,12 +4532,17 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * status = nfs4_set_lock_state(state, request); /* Unlock _before_ we do the RPC call */ request->fl_flags |= FL_EXISTS; + /* Exclude nfs_delegation_claim_locks() */ + mutex_lock(&sp->so_delegreturn_mutex); + /* Exclude nfs4_reclaim_open_stateid() - note nesting! */ down_read(&nfsi->rwsem); if (do_vfs_lock(request->fl_file, request) == -ENOENT) { up_read(&nfsi->rwsem); + mutex_unlock(&sp->so_delegreturn_mutex); goto out; } up_read(&nfsi->rwsem); + mutex_unlock(&sp->so_delegreturn_mutex); if (status != 0) goto out; /* Is this a delegated lock? */ @@ -4576,7 +4621,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) dprintk("%s: begin!\n", __func__); if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0) - return; + goto out_wait; /* Do we need to do an open_to_lock_owner? */ if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) { if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) { @@ -4596,6 +4641,8 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) nfs_release_seqid(data->arg.open_seqid); out_release_lock_seqid: nfs_release_seqid(data->arg.lock_seqid); +out_wait: + nfs4_sequence_done(task, &data->res.seq_res); dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status); } @@ -4813,8 +4860,10 @@ static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *reques static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) { + struct nfs4_state_owner *sp = state->owner; struct nfs_inode *nfsi = NFS_I(state->inode); unsigned char fl_flags = request->fl_flags; + unsigned int seq; int status = -ENOLCK; if ((fl_flags & FL_POSIX) && @@ -4836,9 +4885,16 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock status = do_vfs_lock(request->fl_file, request); goto out_unlock; } + seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); + up_read(&nfsi->rwsem); status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW); if (status != 0) + goto out; + down_read(&nfsi->rwsem); + if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) { + status = -NFS4ERR_DELAY; goto out_unlock; + } /* Note: we always want to sleep here! */ request->fl_flags = fl_flags | FL_SLEEP; if (do_vfs_lock(request->fl_file, request) < 0) @@ -4945,24 +5001,22 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) case 0: case -ESTALE: goto out; - case -NFS4ERR_EXPIRED: - nfs4_schedule_stateid_recovery(server, state); case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: + set_bit(NFS_DELEGATED_STATE, &state->flags); + case -NFS4ERR_EXPIRED: nfs4_schedule_lease_recovery(server->nfs_client); + err = -EAGAIN; goto out; case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: case -NFS4ERR_BAD_HIGH_SLOT: case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_DEADSESSION: + set_bit(NFS_DELEGATED_STATE, &state->flags); nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); + err = -EAGAIN; goto out; - case -ERESTARTSYS: - /* - * The show must go on: exit, but mark the - * stateid as needing recovery. - */ case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: @@ -4975,9 +5029,8 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) /* kill_proc(fl->fl_pid, SIGLOST, 1); */ err = 0; goto out; - case -NFS4ERR_DELAY: - break; } + set_bit(NFS_DELEGATED_STATE, &state->flags); err = nfs4_handle_exception(server, err, &exception); } while (exception.retry); out: @@ -5995,6 +6048,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) struct nfs_server *server = NFS_SERVER(inode); struct pnfs_layout_hdr *lo; struct nfs4_state *state = NULL; + unsigned long timeo, giveup; dprintk("--> %s\n", __func__); @@ -6006,7 +6060,10 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) goto out; case -NFS4ERR_LAYOUTTRYLATER: case -NFS4ERR_RECALLCONFLICT: - task->tk_status = -NFS4ERR_DELAY; + timeo = rpc_get_timeout(task->tk_client); + giveup = lgp->args.timestamp + timeo; + if (time_after(giveup, jiffies)) + task->tk_status = -NFS4ERR_DELAY; break; case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: @@ -6079,11 +6136,13 @@ static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags) static void nfs4_layoutget_release(void *calldata) { struct nfs4_layoutget *lgp = calldata; - struct nfs_server *server = NFS_SERVER(lgp->args.inode); + struct inode *inode = lgp->args.inode; + struct nfs_server *server = NFS_SERVER(inode); size_t max_pages = max_response_pages(server); dprintk("--> %s\n", __func__); nfs4_free_pages(lgp->args.layout.pages, max_pages); + pnfs_put_layout_hdr(NFS_I(inode)->layout); put_nfs_open_context(lgp->args.ctx); kfree(calldata); dprintk("<-- %s\n", __func__); @@ -6098,7 +6157,8 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = { struct pnfs_layout_segment * nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) { - struct nfs_server *server = NFS_SERVER(lgp->args.inode); + struct inode *inode = lgp->args.inode; + struct nfs_server *server = NFS_SERVER(inode); size_t max_pages = max_response_pages(server); struct rpc_task *task; struct rpc_message msg = { @@ -6124,17 +6184,23 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) return ERR_PTR(-ENOMEM); } lgp->args.layout.pglen = max_pages * PAGE_SIZE; + lgp->args.timestamp = jiffies; lgp->res.layoutp = &lgp->args.layout; lgp->res.seq_res.sr_slot = NULL; nfs41_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0); + + /* nfs4_layoutget_release calls pnfs_put_layout_hdr */ + pnfs_get_layout_hdr(NFS_I(inode)->layout); + task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return ERR_CAST(task); status = nfs4_wait_for_completion_rpc_task(task); if (status == 0) status = task->tk_status; - if (status == 0) + /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ + if (status == 0 && lgp->res.layoutp->len) lseg = pnfs_layout_process(lgp); rpc_put_task(task); dprintk("<-- %s status=%d\n", __func__, status); @@ -6350,22 +6416,8 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) static void nfs4_layoutcommit_release(void *calldata) { struct nfs4_layoutcommit_data *data = calldata; - struct pnfs_layout_segment *lseg, *tmp; - unsigned long *bitlock = &NFS_I(data->args.inode)->flags; pnfs_cleanup_layoutcommit(data); - /* Matched by references in pnfs_set_layoutcommit */ - list_for_each_entry_safe(lseg, tmp, &data->lseg_list, pls_lc_list) { - list_del_init(&lseg->pls_lc_list); - if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, - &lseg->pls_flags)) - pnfs_put_lseg(lseg); - } - - clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); - smp_mb__after_clear_bit(); - wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); - put_rpccred(data->cred); kfree(data); } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 9448c579d41a..6ace365c6334 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -136,16 +136,11 @@ int nfs40_discover_server_trunking(struct nfs_client *clp, clp->cl_confirm = clid.confirm; status = nfs40_walk_client_list(clp, result, cred); - switch (status) { - case -NFS4ERR_STALE_CLIENTID: - set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); - case 0: + if (status == 0) { /* Sustain the lease, even if it's empty. If the clientid4 * goes stale it's of no use for trunking discovery. */ nfs4_schedule_state_renewal(*result); - break; } - out: return status; } @@ -523,6 +518,8 @@ nfs4_alloc_state_owner(struct nfs_server *server, nfs4_init_seqid_counter(&sp->so_seqid); atomic_set(&sp->so_count, 1); INIT_LIST_HEAD(&sp->so_lru); + seqcount_init(&sp->so_reclaim_seqcount); + mutex_init(&sp->so_delegreturn_mutex); return sp; } @@ -1395,8 +1392,9 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs * recovering after a network partition or a reboot from a * server that doesn't support a grace period. */ -restart: spin_lock(&sp->so_lock); + write_seqcount_begin(&sp->so_reclaim_seqcount); +restart: list_for_each_entry(state, &sp->so_states, open_states) { if (!test_and_clear_bit(ops->state_flag_bit, &state->flags)) continue; @@ -1417,6 +1415,7 @@ restart: } spin_unlock(&state->state_lock); nfs4_put_open_state(state); + spin_lock(&sp->so_lock); goto restart; } } @@ -1454,12 +1453,17 @@ restart: goto out_err; } nfs4_put_open_state(state); + spin_lock(&sp->so_lock); goto restart; } + write_seqcount_end(&sp->so_reclaim_seqcount); spin_unlock(&sp->so_lock); return 0; out_err: nfs4_put_open_state(state); + spin_lock(&sp->so_lock); + write_seqcount_end(&sp->so_reclaim_seqcount); + spin_unlock(&sp->so_lock); return status; } @@ -1863,6 +1867,7 @@ again: case -ETIMEDOUT: case -EAGAIN: ssleep(1); + case -NFS4ERR_STALE_CLIENTID: dprintk("NFS: %s after status %d, retrying\n", __func__, status); goto again; @@ -2022,8 +2027,18 @@ static int nfs4_reset_session(struct nfs_client *clp) nfs4_begin_drain_session(clp); cred = nfs4_get_exchange_id_cred(clp); status = nfs4_proc_destroy_session(clp->cl_session, cred); - if (status && status != -NFS4ERR_BADSESSION && - status != -NFS4ERR_DEADSESSION) { + switch (status) { + case 0: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_DEADSESSION: + break; + case -NFS4ERR_BACK_CHAN_BUSY: + case -NFS4ERR_DELAY: + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + status = 0; + ssleep(1); + goto out; + default: status = nfs4_recovery_handle_error(clp, status); goto out; } diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 84d2e9e2f313..569b166cc050 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -28,7 +28,7 @@ static struct file_system_type nfs4_remote_fs_type = { .name = "nfs4", .mount = nfs4_remote_mount, .kill_sb = nfs_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, }; static struct file_system_type nfs4_remote_referral_fs_type = { @@ -36,7 +36,7 @@ static struct file_system_type nfs4_remote_referral_fs_type = { .name = "nfs4", .mount = nfs4_remote_referral_mount, .kill_sb = nfs_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, }; struct file_system_type nfs4_referral_fs_type = { @@ -44,7 +44,7 @@ struct file_system_type nfs4_referral_fs_type = { .name = "nfs4", .mount = nfs4_referral_mount, .kill_sb = nfs_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, }; static const struct super_operations nfs4_sops = { diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 26b143920433..e3edda554ac7 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1002,7 +1002,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ); if (owner_namelen < 0) { dprintk("nfs: couldn't resolve uid %d to string\n", - iap->ia_uid); + from_kuid(&init_user_ns, iap->ia_uid)); /* XXX */ strcpy(owner_name, "nobody"); owner_namelen = sizeof("nobody") - 1; @@ -1014,7 +1014,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ); if (owner_grouplen < 0) { dprintk("nfs: couldn't resolve gid %d to string\n", - iap->ia_gid); + from_kgid(&init_user_ns, iap->ia_gid)); strcpy(owner_group, "nobody"); owner_grouplen = sizeof("nobody") - 1; /* goto out; */ @@ -3778,14 +3778,14 @@ out_overflow: } static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, - const struct nfs_server *server, uint32_t *uid, + const struct nfs_server *server, kuid_t *uid, struct nfs4_string *owner_name) { uint32_t len; __be32 *p; int ret = 0; - *uid = -2; + *uid = make_kuid(&init_user_ns, -2); if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) { @@ -3813,7 +3813,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, __func__, len); bitmap[1] &= ~FATTR4_WORD1_OWNER; } - dprintk("%s: uid=%d\n", __func__, (int)*uid); + dprintk("%s: uid=%d\n", __func__, (int)from_kuid(&init_user_ns, *uid)); return ret; out_overflow: print_overflow_msg(__func__, xdr); @@ -3821,14 +3821,14 @@ out_overflow: } static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, - const struct nfs_server *server, uint32_t *gid, + const struct nfs_server *server, kgid_t *gid, struct nfs4_string *group_name) { uint32_t len; __be32 *p; int ret = 0; - *gid = -2; + *gid = make_kgid(&init_user_ns, -2); if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) { @@ -3856,7 +3856,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, __func__, len); bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP; } - dprintk("%s: gid=%d\n", __func__, (int)*gid); + dprintk("%s: gid=%d\n", __func__, (int)from_kgid(&init_user_ns, *gid)); return ret; out_overflow: print_overflow_msg(__func__, xdr); diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index c6f990656f89..88f9611a945c 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -647,6 +647,7 @@ static struct pnfs_layoutdriver_type objlayout_type = { .flags = PNFS_LAYOUTRET_ON_SETATTR | PNFS_LAYOUTRET_ON_ERROR, + .owner = THIS_MODULE, .alloc_layout_hdr = objlayout_alloc_layout_hdr, .free_layout_hdr = objlayout_free_layout_hdr, diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index d00260b08103..4bdffe0ba025 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -417,6 +417,16 @@ should_free_lseg(struct pnfs_layout_range *lseg_range, lo_seg_intersecting(lseg_range, recall_range); } +static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, + struct list_head *tmp_list) +{ + if (!atomic_dec_and_test(&lseg->pls_refcount)) + return false; + pnfs_layout_remove_lseg(lseg->pls_layout, lseg); + list_add(&lseg->pls_list, tmp_list); + return true; +} + /* Returns 1 if lseg is removed from list, 0 otherwise */ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, struct list_head *tmp_list) @@ -430,11 +440,8 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, */ dprintk("%s: lseg %p ref %d\n", __func__, lseg, atomic_read(&lseg->pls_refcount)); - if (atomic_dec_and_test(&lseg->pls_refcount)) { - pnfs_layout_remove_lseg(lseg->pls_layout, lseg); - list_add(&lseg->pls_list, tmp_list); + if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list)) rv = 1; - } } return rv; } @@ -505,37 +512,147 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) } EXPORT_SYMBOL_GPL(pnfs_destroy_layout); -/* - * Called by the state manger to remove all layouts established under an - * expired lease. - */ -void -pnfs_destroy_all_layouts(struct nfs_client *clp) +static bool +pnfs_layout_add_bulk_destroy_list(struct inode *inode, + struct list_head *layout_list) { - struct nfs_server *server; struct pnfs_layout_hdr *lo; - LIST_HEAD(tmp_list); + bool ret = false; - nfs4_deviceid_mark_client_invalid(clp); - nfs4_deviceid_purge_client(clp); + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (lo != NULL && list_empty(&lo->plh_bulk_destroy)) { + pnfs_get_layout_hdr(lo); + list_add(&lo->plh_bulk_destroy, layout_list); + ret = true; + } + spin_unlock(&inode->i_lock); + return ret; +} + +/* Caller must hold rcu_read_lock and clp->cl_lock */ +static int +pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, + struct nfs_server *server, + struct list_head *layout_list) +{ + struct pnfs_layout_hdr *lo, *next; + struct inode *inode; + + list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) { + inode = igrab(lo->plh_inode); + if (inode == NULL) + continue; + list_del_init(&lo->plh_layouts); + if (pnfs_layout_add_bulk_destroy_list(inode, layout_list)) + continue; + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + iput(inode); + spin_lock(&clp->cl_lock); + rcu_read_lock(); + return -EAGAIN; + } + return 0; +} + +static int +pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, + bool is_bulk_recall) +{ + struct pnfs_layout_hdr *lo; + struct inode *inode; + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + LIST_HEAD(lseg_list); + int ret = 0; + + while (!list_empty(layout_list)) { + lo = list_entry(layout_list->next, struct pnfs_layout_hdr, + plh_bulk_destroy); + dprintk("%s freeing layout for inode %lu\n", __func__, + lo->plh_inode->i_ino); + inode = lo->plh_inode; + spin_lock(&inode->i_lock); + list_del_init(&lo->plh_bulk_destroy); + lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ + if (is_bulk_recall) + set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); + if (pnfs_mark_matching_lsegs_invalid(lo, &lseg_list, &range)) + ret = -EAGAIN; + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&lseg_list); + pnfs_put_layout_hdr(lo); + iput(inode); + } + return ret; +} + +int +pnfs_destroy_layouts_byfsid(struct nfs_client *clp, + struct nfs_fsid *fsid, + bool is_recall) +{ + struct nfs_server *server; + LIST_HEAD(layout_list); spin_lock(&clp->cl_lock); rcu_read_lock(); +restart: list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - if (!list_empty(&server->layouts)) - list_splice_init(&server->layouts, &tmp_list); + if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0) + continue; + if (pnfs_layout_bulk_destroy_byserver_locked(clp, + server, + &layout_list) != 0) + goto restart; } rcu_read_unlock(); spin_unlock(&clp->cl_lock); - while (!list_empty(&tmp_list)) { - lo = list_entry(tmp_list.next, struct pnfs_layout_hdr, - plh_layouts); - dprintk("%s freeing layout for inode %lu\n", __func__, - lo->plh_inode->i_ino); - list_del_init(&lo->plh_layouts); - pnfs_destroy_layout(NFS_I(lo->plh_inode)); + if (list_empty(&layout_list)) + return 0; + return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); +} + +int +pnfs_destroy_layouts_byclid(struct nfs_client *clp, + bool is_recall) +{ + struct nfs_server *server; + LIST_HEAD(layout_list); + + spin_lock(&clp->cl_lock); + rcu_read_lock(); +restart: + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + if (pnfs_layout_bulk_destroy_byserver_locked(clp, + server, + &layout_list) != 0) + goto restart; } + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + + if (list_empty(&layout_list)) + return 0; + return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); +} + +/* + * Called by the state manger to remove all layouts established under an + * expired lease. + */ +void +pnfs_destroy_all_layouts(struct nfs_client *clp) +{ + nfs4_deviceid_mark_client_invalid(clp); + nfs4_deviceid_purge_client(clp); + + pnfs_destroy_layouts_byclid(clp, false); } /* @@ -667,6 +784,21 @@ send_layoutget(struct pnfs_layout_hdr *lo, return lseg; } +static void pnfs_clear_layoutcommit(struct inode *inode, + struct list_head *head) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct pnfs_layout_segment *lseg, *tmp; + + if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + return; + list_for_each_entry_safe(lseg, tmp, &nfsi->layout->plh_segs, pls_list) { + if (!test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) + continue; + pnfs_lseg_dec_and_remove_zero(lseg, head); + } +} + /* * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr * when the layout segment list is empty. @@ -698,6 +830,7 @@ _pnfs_return_layout(struct inode *ino) /* Reference matched in nfs4_layoutreturn_release */ pnfs_get_layout_hdr(lo); empty = list_empty(&lo->plh_segs); + pnfs_clear_layoutcommit(ino, &tmp_list); pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); /* Don't send a LAYOUTRETURN if list was initially empty */ if (empty) { @@ -710,8 +843,6 @@ _pnfs_return_layout(struct inode *ino) spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); - WARN_ON(test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)); - lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); if (unlikely(lrp == NULL)) { status = -ENOMEM; @@ -735,6 +866,33 @@ out: } EXPORT_SYMBOL_GPL(_pnfs_return_layout); +int +pnfs_commit_and_return_layout(struct inode *inode) +{ + struct pnfs_layout_hdr *lo; + int ret; + + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (lo == NULL) { + spin_unlock(&inode->i_lock); + return 0; + } + pnfs_get_layout_hdr(lo); + /* Block new layoutgets and read/write to ds */ + lo->plh_block_lgets++; + spin_unlock(&inode->i_lock); + filemap_fdatawait(inode->i_mapping); + ret = pnfs_layoutcommit_inode(inode, true); + if (ret == 0) + ret = _pnfs_return_layout(inode); + spin_lock(&inode->i_lock); + lo->plh_block_lgets--; + spin_unlock(&inode->i_lock); + pnfs_put_layout_hdr(lo); + return ret; +} + bool pnfs_roc(struct inode *ino) { struct pnfs_layout_hdr *lo; @@ -888,7 +1046,7 @@ alloc_init_layout_hdr(struct inode *ino, atomic_set(&lo->plh_refcount, 1); INIT_LIST_HEAD(&lo->plh_layouts); INIT_LIST_HEAD(&lo->plh_segs); - INIT_LIST_HEAD(&lo->plh_bulk_recall); + INIT_LIST_HEAD(&lo->plh_bulk_destroy); lo->plh_inode = ino; lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred); return lo; @@ -1071,7 +1229,7 @@ pnfs_update_layout(struct inode *ino, struct nfs_client *clp = server->nfs_client; struct pnfs_layout_hdr *lo; struct pnfs_layout_segment *lseg = NULL; - bool first = false; + bool first; if (!pnfs_enabled_sb(NFS_SERVER(ino))) goto out; @@ -1105,10 +1263,9 @@ pnfs_update_layout(struct inode *ino, goto out_unlock; atomic_inc(&lo->plh_outstanding); - if (list_empty(&lo->plh_segs)) - first = true; - + first = list_empty(&lo->plh_layouts) ? true : false; spin_unlock(&ino->i_lock); + if (first) { /* The lo must be on the clp list if there is any * chance of a CB_LAYOUTRECALL(FILE) coming in. @@ -1312,13 +1469,15 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head, - const struct nfs_pgio_completion_ops *compl_ops) + const struct nfs_pgio_completion_ops *compl_ops, + struct nfs_direct_req *dreq) { struct nfs_pageio_descriptor pgio; LIST_HEAD(failed); /* Resend all requests through the MDS */ nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, compl_ops); + pgio.pg_dreq = dreq; while (!list_empty(head)) { struct nfs_page *req = nfs_list_entry(head->next); @@ -1347,13 +1506,13 @@ static void pnfs_ld_handle_write_error(struct nfs_write_data *data) dprintk("pnfs write error = %d\n", hdr->pnfs_error); if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & PNFS_LAYOUTRET_ON_ERROR) { - clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags); pnfs_return_layout(hdr->inode); } if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode, &hdr->pages, - hdr->completion_ops); + hdr->completion_ops, + hdr->dreq); } /* @@ -1468,13 +1627,15 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head, - const struct nfs_pgio_completion_ops *compl_ops) + const struct nfs_pgio_completion_ops *compl_ops, + struct nfs_direct_req *dreq) { struct nfs_pageio_descriptor pgio; LIST_HEAD(failed); /* Resend all requests through the MDS */ nfs_pageio_init_read(&pgio, inode, compl_ops); + pgio.pg_dreq = dreq; while (!list_empty(head)) { struct nfs_page *req = nfs_list_entry(head->next); @@ -1499,13 +1660,13 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data) dprintk("pnfs read error = %d\n", hdr->pnfs_error); if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & PNFS_LAYOUTRET_ON_ERROR) { - clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags); pnfs_return_layout(hdr->inode); } if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode, &hdr->pages, - hdr->completion_ops); + hdr->completion_ops, + hdr->dreq); } /* @@ -1631,11 +1792,27 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp) list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) { if (lseg->pls_range.iomode == IOMODE_RW && - test_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) + test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) list_add(&lseg->pls_lc_list, listp); } } +static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp) +{ + struct pnfs_layout_segment *lseg, *tmp; + unsigned long *bitlock = &NFS_I(inode)->flags; + + /* Matched by references in pnfs_set_layoutcommit */ + list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) { + list_del_init(&lseg->pls_lc_list); + pnfs_put_lseg(lseg); + } + + clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); + smp_mb__after_clear_bit(); + wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); +} + void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) { pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode); @@ -1680,6 +1857,7 @@ void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) if (nfss->pnfs_curr_ld->cleanup_layoutcommit) nfss->pnfs_curr_ld->cleanup_layoutcommit(data); + pnfs_list_write_lseg_done(data->args.inode, &data->lseg_list); } /* diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index dbf7bba52da0..f5f8a470a647 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -132,7 +132,7 @@ struct pnfs_layoutdriver_type { struct pnfs_layout_hdr { atomic_t plh_refcount; struct list_head plh_layouts; /* other client layouts */ - struct list_head plh_bulk_recall; /* clnt list of bulk recalls */ + struct list_head plh_bulk_destroy; struct list_head plh_segs; /* layout segments list */ nfs4_stateid plh_stateid; atomic_t plh_outstanding; /* number of RPCs out */ @@ -196,6 +196,11 @@ struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); void pnfs_destroy_layout(struct nfs_inode *); void pnfs_destroy_all_layouts(struct nfs_client *); +int pnfs_destroy_layouts_byfsid(struct nfs_client *clp, + struct nfs_fsid *fsid, + bool is_recall); +int pnfs_destroy_layouts_byclid(struct nfs_client *clp, + bool is_recall); void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, @@ -214,6 +219,7 @@ void pnfs_set_layoutcommit(struct nfs_write_data *wdata); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); int _pnfs_return_layout(struct inode *); +int pnfs_commit_and_return_layout(struct inode *); void pnfs_ld_write_done(struct nfs_write_data *); void pnfs_ld_read_done(struct nfs_read_data *); struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, @@ -225,9 +231,11 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp); int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head, - const struct nfs_pgio_completion_ops *compl_ops); + const struct nfs_pgio_completion_ops *compl_ops, + struct nfs_direct_req *dreq); int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head, - const struct nfs_pgio_completion_ops *compl_ops); + const struct nfs_pgio_completion_ops *compl_ops, + struct nfs_direct_req *dreq); struct nfs4_threshold *pnfs_mdsthreshold_alloc(void); /* nfs4_deviceid_flags */ @@ -400,6 +408,11 @@ static inline int pnfs_return_layout(struct inode *ino) return 0; } +static inline int pnfs_commit_and_return_layout(struct inode *inode) +{ + return 0; +} + static inline bool pnfs_ld_layoutret_on_setattr(struct inode *inode) { diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index d35b62e83ea6..6da209bd9408 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -77,9 +77,8 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld, long hash) { struct nfs4_deviceid_node *d; - struct hlist_node *n; - hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node) + hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[hash], node) if (d->ld == ld && d->nfs_client == clp && !memcmp(&d->deviceid, id, sizeof(*id))) { if (atomic_read(&d->ref)) @@ -248,12 +247,11 @@ static void _deviceid_purge_client(const struct nfs_client *clp, long hash) { struct nfs4_deviceid_node *d; - struct hlist_node *n; HLIST_HEAD(tmp); spin_lock(&nfs4_deviceid_lock); rcu_read_lock(); - hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node) + hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[hash], node) if (d->nfs_client == clp && atomic_read(&d->ref)) { hlist_del_init_rcu(&d->node); hlist_add_head(&d->tmpnode, &tmp); @@ -291,12 +289,11 @@ void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp) { struct nfs4_deviceid_node *d; - struct hlist_node *n; int i; rcu_read_lock(); for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i ++){ - hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[i], node) + hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[i], node) if (d->nfs_client == clp) set_bit(NFS_DEVICEID_INVALID, &d->flags); } diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index f084dac948e1..fc8de9016acf 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -662,7 +662,7 @@ nfs_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg) static int nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2e7e8c878e5d..2f8a29db0f1b 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -31,6 +31,7 @@ #include <linux/errno.h> #include <linux/unistd.h> #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/metrics.h> #include <linux/sunrpc/xprtsock.h> @@ -54,7 +55,6 @@ #include <linux/parser.h> #include <linux/nsproxy.h> #include <linux/rcupdate.h> -#include <linux/kthread.h> #include <asm/uaccess.h> @@ -292,8 +292,9 @@ struct file_system_type nfs_fs_type = { .name = "nfs", .mount = nfs_fs_mount, .kill_sb = nfs_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, }; +MODULE_ALIAS_FS("nfs"); EXPORT_SYMBOL_GPL(nfs_fs_type); struct file_system_type nfs_xdev_fs_type = { @@ -301,7 +302,7 @@ struct file_system_type nfs_xdev_fs_type = { .name = "nfs", .mount = nfs_xdev_mount, .kill_sb = nfs_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, }; const struct super_operations nfs_sops = { @@ -331,8 +332,10 @@ struct file_system_type nfs4_fs_type = { .name = "nfs4", .mount = nfs_fs_mount, .kill_sb = nfs_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, }; +MODULE_ALIAS_FS("nfs4"); +MODULE_ALIAS("nfs4"); EXPORT_SYMBOL_GPL(nfs4_fs_type); static int __init register_nfs4_fs(void) @@ -418,54 +421,6 @@ void nfs_sb_deactive(struct super_block *sb) } EXPORT_SYMBOL_GPL(nfs_sb_deactive); -static int nfs_deactivate_super_async_work(void *ptr) -{ - struct super_block *sb = ptr; - - deactivate_super(sb); - module_put_and_exit(0); - return 0; -} - -/* - * same effect as deactivate_super, but will do final unmount in kthread - * context - */ -static void nfs_deactivate_super_async(struct super_block *sb) -{ - struct task_struct *task; - char buf[INET6_ADDRSTRLEN + 1]; - struct nfs_server *server = NFS_SB(sb); - struct nfs_client *clp = server->nfs_client; - - if (!atomic_add_unless(&sb->s_active, -1, 1)) { - rcu_read_lock(); - snprintf(buf, sizeof(buf), - rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); - rcu_read_unlock(); - - __module_get(THIS_MODULE); - task = kthread_run(nfs_deactivate_super_async_work, sb, - "%s-deactivate-super", buf); - if (IS_ERR(task)) { - pr_err("%s: kthread_run: %ld\n", - __func__, PTR_ERR(task)); - /* make synchronous call and hope for the best */ - deactivate_super(sb); - module_put(THIS_MODULE); - } - } -} - -void nfs_sb_deactive_async(struct super_block *sb) -{ - struct nfs_server *server = NFS_SB(sb); - - if (atomic_dec_and_test(&server->active)) - nfs_deactivate_super_async(sb); -} -EXPORT_SYMBOL_GPL(nfs_sb_deactive_async); - /* * Deliver file system statistics to userspace */ @@ -2589,27 +2544,23 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags, struct nfs_server *server; struct dentry *mntroot = ERR_PTR(-ENOMEM); struct nfs_subversion *nfs_mod = NFS_SB(data->sb)->nfs_client->cl_nfs_mod; - int error; - dprintk("--> nfs_xdev_mount_common()\n"); + dprintk("--> nfs_xdev_mount()\n"); mount_info.mntfh = mount_info.cloned->fh; /* create a new volume representation */ server = nfs_mod->rpc_ops->clone_server(NFS_SB(data->sb), data->fh, data->fattr, data->authflavor); - if (IS_ERR(server)) { - error = PTR_ERR(server); - goto out_err; - } - mntroot = nfs_fs_mount_common(server, flags, dev_name, &mount_info, nfs_mod); - dprintk("<-- nfs_xdev_mount_common() = 0\n"); -out: - return mntroot; + if (IS_ERR(server)) + mntroot = ERR_CAST(server); + else + mntroot = nfs_fs_mount_common(server, flags, + dev_name, &mount_info, nfs_mod); -out_err: - dprintk("<-- nfs_xdev_mount_common() = %d [error]\n", error); - goto out; + dprintk("<-- nfs_xdev_mount() = %ld\n", + IS_ERR(mntroot) ? PTR_ERR(mntroot) : 0L); + return mntroot; } #if IS_ENABLED(CONFIG_NFS_V4) @@ -2769,6 +2720,5 @@ module_param(send_implementation_id, ushort, 0644); MODULE_PARM_DESC(send_implementation_id, "Send implementation ID with NFSv4.1 exchange_id"); MODULE_PARM_DESC(nfs4_unique_id, "nfs_client_id4 uniquifier string"); -MODULE_ALIAS("nfs4"); #endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 3f79c77153b8..1f1f38f0c5d5 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -95,7 +95,7 @@ static void nfs_async_unlink_release(void *calldata) nfs_dec_sillycount(data->dir); nfs_free_unlinkdata(data); - nfs_sb_deactive_async(sb); + nfs_sb_deactive(sb); } static void nfs_unlink_prepare(struct rpc_task *task, void *calldata) @@ -268,8 +268,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry) * point dentry is definitely not a root, so we won't need * that anymore. */ - if (devname_garbage) - kfree(devname_garbage); + kfree(devname_garbage); return 0; out_unlock: spin_unlock(&dentry->d_lock); @@ -336,20 +335,14 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata) struct inode *old_dir = data->old_dir; struct inode *new_dir = data->new_dir; struct dentry *old_dentry = data->old_dentry; - struct dentry *new_dentry = data->new_dentry; if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { rpc_restart_call_prepare(task); return; } - if (task->tk_status != 0) { + if (task->tk_status != 0) nfs_cancel_async_unlink(old_dentry); - return; - } - - d_drop(old_dentry); - d_drop(new_dentry); } /** @@ -550,6 +543,18 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) error = rpc_wait_for_completion_task(task); if (error == 0) error = task->tk_status; + switch (error) { + case 0: + /* The rename succeeded */ + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + d_move(dentry, sdentry); + break; + case -ERESTARTSYS: + /* The result of the rename is unknown. Play it safe by + * forcing a new lookup */ + d_drop(dentry); + d_drop(sdentry); + } rpc_put_task(task); out_dput: dput(sdentry); diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c index 6940439bd609..ed628f71274c 100644 --- a/fs/nfs_common/nfsacl.c +++ b/fs/nfs_common/nfsacl.c @@ -38,8 +38,8 @@ struct nfsacl_encode_desc { unsigned int count; struct posix_acl *acl; int typeflag; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; }; struct nfsacl_simple_acl { @@ -60,14 +60,16 @@ xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem) *p++ = htonl(entry->e_tag | nfsacl_desc->typeflag); switch(entry->e_tag) { case ACL_USER_OBJ: - *p++ = htonl(nfsacl_desc->uid); + *p++ = htonl(from_kuid(&init_user_ns, nfsacl_desc->uid)); break; case ACL_GROUP_OBJ: - *p++ = htonl(nfsacl_desc->gid); + *p++ = htonl(from_kgid(&init_user_ns, nfsacl_desc->gid)); break; case ACL_USER: + *p++ = htonl(from_kuid(&init_user_ns, entry->e_uid)); + break; case ACL_GROUP: - *p++ = htonl(entry->e_id); + *p++ = htonl(from_kgid(&init_user_ns, entry->e_gid)); break; default: /* Solaris depends on that! */ *p++ = 0; @@ -148,6 +150,7 @@ xdr_nfsace_decode(struct xdr_array2_desc *desc, void *elem) (struct nfsacl_decode_desc *) desc; __be32 *p = elem; struct posix_acl_entry *entry; + unsigned int id; if (!nfsacl_desc->acl) { if (desc->array_len > NFS_ACL_MAX_ENTRIES) @@ -160,14 +163,22 @@ xdr_nfsace_decode(struct xdr_array2_desc *desc, void *elem) entry = &nfsacl_desc->acl->a_entries[nfsacl_desc->count++]; entry->e_tag = ntohl(*p++) & ~NFS_ACL_DEFAULT; - entry->e_id = ntohl(*p++); + id = ntohl(*p++); entry->e_perm = ntohl(*p++); switch(entry->e_tag) { - case ACL_USER_OBJ: case ACL_USER: - case ACL_GROUP_OBJ: + entry->e_uid = make_kuid(&init_user_ns, id); + if (!uid_valid(entry->e_uid)) + return -EINVAL; + break; case ACL_GROUP: + entry->e_gid = make_kgid(&init_user_ns, id); + if (!gid_valid(entry->e_gid)) + return -EINVAL; + break; + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: case ACL_OTHER: if (entry->e_perm & ~S_IRWXO) return -EINVAL; @@ -190,9 +201,13 @@ cmp_acl_entry(const void *x, const void *y) if (a->e_tag != b->e_tag) return a->e_tag - b->e_tag; - else if (a->e_id > b->e_id) + else if ((a->e_tag == ACL_USER) && uid_gt(a->e_uid, b->e_uid)) + return 1; + else if ((a->e_tag == ACL_USER) && uid_lt(a->e_uid, b->e_uid)) + return -1; + else if ((a->e_tag == ACL_GROUP) && gid_gt(a->e_gid, b->e_gid)) return 1; - else if (a->e_id < b->e_id) + else if ((a->e_tag == ACL_GROUP) && gid_lt(a->e_gid, b->e_gid)) return -1; else return 0; @@ -213,22 +228,18 @@ posix_acl_from_nfsacl(struct posix_acl *acl) sort(acl->a_entries, acl->a_count, sizeof(struct posix_acl_entry), cmp_acl_entry, NULL); - /* Clear undefined identifier fields and find the ACL_GROUP_OBJ - and ACL_MASK entries. */ + /* Find the ACL_GROUP_OBJ and ACL_MASK entries. */ FOREACH_ACL_ENTRY(pa, acl, pe) { switch(pa->e_tag) { case ACL_USER_OBJ: - pa->e_id = ACL_UNDEFINED_ID; break; case ACL_GROUP_OBJ: - pa->e_id = ACL_UNDEFINED_ID; group_obj = pa; break; case ACL_MASK: mask = pa; /* fall through */ case ACL_OTHER: - pa->e_id = ACL_UNDEFINED_ID; break; } } diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 8df1ea4a6ff9..430b6872806f 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -65,8 +65,8 @@ config NFSD_V3_ACL If unsure, say N. config NFSD_V4 - bool "NFS server support for NFS version 4 (EXPERIMENTAL)" - depends on NFSD && PROC_FS && EXPERIMENTAL + bool "NFS server support for NFS version 4" + depends on NFSD && PROC_FS select NFSD_V3 select FS_POSIX_ACL select SUNRPC_GSS diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h index 34e5c40af5ef..8b186a4955cc 100644 --- a/fs/nfsd/acl.h +++ b/fs/nfsd/acl.h @@ -44,8 +44,6 @@ struct nfs4_acl *nfs4_acl_new(int); int nfs4_acl_get_whotype(char *, u32); int nfs4_acl_write_who(int who, char *p); -int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group, - uid_t who, u32 mask); #define NFS4_ACL_TYPE_DEFAULT 0x01 #define NFS4_ACL_DIR 0x02 diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 34a10d78b839..06cddd572264 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -47,9 +47,9 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) if (!gi) goto oom; } else if (flags & NFSEXP_ROOTSQUASH) { - if (!new->fsuid) + if (uid_eq(new->fsuid, GLOBAL_ROOT_UID)) new->fsuid = exp->ex_anon_uid; - if (!new->fsgid) + if (gid_eq(new->fsgid, GLOBAL_ROOT_GID)) new->fsgid = exp->ex_anon_gid; gi = groups_alloc(rqgi->ngroups); @@ -58,7 +58,7 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) for (i = 0; i < rqgi->ngroups; i++) { if (gid_eq(GLOBAL_ROOT_GID, GROUP_AT(rqgi, i))) - GROUP_AT(gi, i) = make_kgid(&init_user_ns, exp->ex_anon_gid); + GROUP_AT(gi, i) = exp->ex_anon_gid; else GROUP_AT(gi, i) = GROUP_AT(rqgi, i); } @@ -66,9 +66,9 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) gi = get_group_info(rqgi); } - if (new->fsuid == (uid_t) -1) + if (uid_eq(new->fsuid, INVALID_UID)) new->fsuid = exp->ex_anon_uid; - if (new->fsgid == (gid_t) -1) + if (gid_eq(new->fsgid, INVALID_GID)) new->fsgid = exp->ex_anon_gid; ret = set_groups(new, gi); @@ -76,7 +76,7 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) if (ret < 0) goto error; - if (new->fsuid) + if (!uid_eq(new->fsuid, GLOBAL_ROOT_UID)) new->cap_effective = cap_drop_nfsd_set(new->cap_effective); else new->cap_effective = cap_raise_nfsd_set(new->cap_effective, diff --git a/fs/nfsd/auth.h b/fs/nfsd/auth.h index 78b3c0e93822..53325a12ba62 100644 --- a/fs/nfsd/auth.h +++ b/fs/nfsd/auth.h @@ -1,6 +1,5 @@ /* * nfsd-specific authentication stuff. - * uid/gid mapping not yet implemented. * * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ @@ -8,11 +7,6 @@ #ifndef LINUX_NFSD_AUTH_H #define LINUX_NFSD_AUTH_H -#define nfsd_luid(rq, uid) ((u32)(uid)) -#define nfsd_lgid(rq, gid) ((u32)(gid)) -#define nfsd_ruid(rq, uid) ((u32)(uid)) -#define nfsd_rgid(rq, gid) ((u32)(gid)) - /* * Set the current process's fsuid/fsgid etc to those of the NFS * client user diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index 93cc9d34c459..87fd1410b737 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -12,6 +12,10 @@ /* * Representation of a reply cache entry. + * + * Note that we use a sockaddr_in6 to hold the address instead of the more + * typical sockaddr_storage. This is for space reasons, since sockaddr_storage + * is much larger than a sockaddr_in6. */ struct svc_cacherep { struct hlist_node c_hash; @@ -20,11 +24,13 @@ struct svc_cacherep { unsigned char c_state, /* unused, inprog, done */ c_type, /* status, buffer */ c_secure : 1; /* req came from port < 1024 */ - struct sockaddr_in c_addr; + struct sockaddr_in6 c_addr; __be32 c_xid; u32 c_prot; u32 c_proc; u32 c_vers; + unsigned int c_len; + __wsum c_csum; unsigned long c_timestamp; union { struct kvec u_vec; @@ -46,8 +52,7 @@ enum { enum { RC_DROPIT, RC_REPLY, - RC_DOIT, - RC_INTR + RC_DOIT }; /* @@ -67,6 +72,12 @@ enum { */ #define RC_DELAY (HZ/5) +/* Cache entries expire after this time period */ +#define RC_EXPIRE (120 * HZ) + +/* Checksum this amount of the request */ +#define RC_CSUMLEN (256U) + int nfsd_reply_cache_init(void); void nfsd_reply_cache_shutdown(void); int nfsd_cache_lookup(struct svc_rqst *); diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index a3946cf13fc8..5f38ea36e266 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -67,11 +67,6 @@ static void expkey_request(struct cache_detail *cd, (*bpp)[-1] = '\n'; } -static int expkey_upcall(struct cache_detail *cd, struct cache_head *h) -{ - return sunrpc_cache_pipe_upcall(cd, h, expkey_request); -} - static struct svc_expkey *svc_expkey_update(struct cache_detail *cd, struct svc_expkey *new, struct svc_expkey *old); static struct svc_expkey *svc_expkey_lookup(struct cache_detail *cd, struct svc_expkey *); @@ -245,7 +240,7 @@ static struct cache_detail svc_expkey_cache_template = { .hash_size = EXPKEY_HASHMAX, .name = "nfsd.fh", .cache_put = expkey_put, - .cache_upcall = expkey_upcall, + .cache_request = expkey_request, .cache_parse = expkey_parse, .cache_show = expkey_show, .match = expkey_match, @@ -315,6 +310,7 @@ static void svc_export_put(struct kref *ref) path_put(&exp->ex_path); auth_domain_put(exp->ex_client); nfsd4_fslocs_free(&exp->ex_fslocs); + kfree(exp->ex_uuid); kfree(exp); } @@ -337,11 +333,6 @@ static void svc_export_request(struct cache_detail *cd, (*bpp)[-1] = '\n'; } -static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h) -{ - return sunrpc_cache_pipe_upcall(cd, h, svc_export_request); -} - static struct svc_export *svc_export_update(struct svc_export *new, struct svc_export *old); static struct svc_export *svc_export_lookup(struct svc_export *); @@ -544,13 +535,17 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) err = get_int(&mesg, &an_int); if (err) goto out3; - exp.ex_anon_uid= an_int; + exp.ex_anon_uid= make_kuid(&init_user_ns, an_int); + if (!uid_valid(exp.ex_anon_uid)) + goto out3; /* anon gid */ err = get_int(&mesg, &an_int); if (err) goto out3; - exp.ex_anon_gid= an_int; + exp.ex_anon_gid= make_kgid(&init_user_ns, an_int); + if (!gid_valid(exp.ex_anon_gid)) + goto out3; /* fsid */ err = get_int(&mesg, &an_int); @@ -613,7 +608,7 @@ out: } static void exp_flags(struct seq_file *m, int flag, int fsid, - uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fslocs); + kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fslocs); static void show_secinfo(struct seq_file *m, struct svc_export *exp); static int svc_export_show(struct seq_file *m, @@ -670,6 +665,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) new->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = 0; + new->ex_uuid = NULL; new->cd = item->cd; } @@ -711,7 +707,7 @@ static struct cache_detail svc_export_cache_template = { .hash_size = EXPORT_HASHMAX, .name = "nfsd.export", .cache_put = svc_export_put, - .cache_upcall = svc_export_upcall, + .cache_request = svc_export_request, .cache_parse = svc_export_parse, .cache_show = svc_export_show, .match = svc_export_match, @@ -1179,15 +1175,17 @@ static void show_secinfo(struct seq_file *m, struct svc_export *exp) } static void exp_flags(struct seq_file *m, int flag, int fsid, - uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fsloc) + kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fsloc) { show_expflags(m, flag, NFSEXP_ALLFLAGS); if (flag & NFSEXP_FSID) seq_printf(m, ",fsid=%d", fsid); - if (anonu != (uid_t)-2 && anonu != (0x10000-2)) - seq_printf(m, ",anonuid=%u", anonu); - if (anong != (gid_t)-2 && anong != (0x10000-2)) - seq_printf(m, ",anongid=%u", anong); + if (!uid_eq(anonu, make_kuid(&init_user_ns, (uid_t)-2)) && + !uid_eq(anonu, make_kuid(&init_user_ns, 0x10000-2))) + seq_printf(m, ",anonuid=%u", from_kuid(&init_user_ns, anonu)); + if (!gid_eq(anong, make_kgid(&init_user_ns, (gid_t)-2)) && + !gid_eq(anong, make_kgid(&init_user_ns, 0x10000-2))) + seq_printf(m, ",anongid=%u", from_kgid(&init_user_ns, anong)); if (fsloc && fsloc->locations_count > 0) { char *loctype = (fsloc->migrated) ? "refer" : "replicas"; int i; diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c index e761ee95617f..d620e7f81429 100644 --- a/fs/nfsd/fault_inject.c +++ b/fs/nfsd/fault_inject.c @@ -9,7 +9,7 @@ #include <linux/debugfs.h> #include <linux/module.h> #include <linux/nsproxy.h> -#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> #include <asm/uaccess.h> #include "state.h" @@ -101,7 +101,7 @@ static ssize_t fault_inject_read(struct file *file, char __user *buf, loff_t pos = *ppos; if (!pos) - nfsd_inject_get(file->f_dentry->d_inode->i_private, &val); + nfsd_inject_get(file_inode(file)->i_private, &val); size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val); if (pos < 0) @@ -133,10 +133,10 @@ static ssize_t fault_inject_write(struct file *file, const char __user *buf, size = rpc_pton(net, write_buf, size, (struct sockaddr *)&sa, sizeof(sa)); if (size > 0) - nfsd_inject_set_client(file->f_dentry->d_inode->i_private, &sa, size); + nfsd_inject_set_client(file_inode(file)->i_private, &sa, size); else { val = simple_strtoll(write_buf, NULL, 0); - nfsd_inject_set(file->f_dentry->d_inode->i_private, val); + nfsd_inject_set(file_inode(file)->i_private, val); } return len; /* on success, claim we got the whole input */ } diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h index 9d513efc01ba..bf95f6b817a4 100644 --- a/fs/nfsd/idmap.h +++ b/fs/nfsd/idmap.h @@ -54,9 +54,9 @@ static inline void nfsd_idmap_shutdown(struct net *net) } #endif -__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *); -__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, __u32 *); -int nfsd_map_uid_to_name(struct svc_rqst *, __u32, char *); -int nfsd_map_gid_to_name(struct svc_rqst *, __u32, char *); +__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, kuid_t *); +__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, kgid_t *); +int nfsd_map_uid_to_name(struct svc_rqst *, kuid_t, char *); +int nfsd_map_gid_to_name(struct svc_rqst *, kgid_t, char *); #endif /* LINUX_NFSD_IDMAP_H */ diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 9170861c804a..95d76dc6c5da 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -45,6 +45,10 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, RETURN_STATUS(nfserr_inval); resp->mask = argp->mask; + nfserr = fh_getattr(fh, &resp->stat); + if (nfserr) + goto fail; + if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { acl = nfsd_get_posix_acl(fh, ACL_TYPE_ACCESS); if (IS_ERR(acl)) { @@ -115,6 +119,9 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp, nfserr = nfserrno( nfsd_set_posix_acl( fh, ACL_TYPE_DEFAULT, argp->acl_default) ); } + if (!nfserr) { + nfserr = fh_getattr(fh, &resp->stat); + } /* argp->acl_{access,default} may have been allocated in nfssvc_decode_setaclargs. */ @@ -129,10 +136,15 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp, static __be32 nfsacld_proc_getattr(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, struct nfsd_attrstat *resp) { + __be32 nfserr; dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); - return fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + if (nfserr) + return nfserr; + nfserr = fh_getattr(&resp->fh, &resp->stat); + return nfserr; } /* @@ -150,6 +162,9 @@ static __be32 nfsacld_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessarg fh_copy(&resp->fh, &argp->fh); resp->access = argp->access; nfserr = nfsd_access(rqstp, &resp->fh, &resp->access, NULL); + if (nfserr) + return nfserr; + nfserr = fh_getattr(&resp->fh, &resp->stat); return nfserr; } @@ -243,7 +258,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p, return 0; inode = dentry->d_inode; - p = nfs2svc_encode_fattr(rqstp, p, &resp->fh); + p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat); *p++ = htonl(resp->mask); if (!xdr_ressize_check(rqstp, p)) return 0; @@ -274,7 +289,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p, static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p, struct nfsd_attrstat *resp) { - p = nfs2svc_encode_fattr(rqstp, p, &resp->fh); + p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat); return xdr_ressize_check(rqstp, p); } @@ -282,7 +297,7 @@ static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p, static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_accessres *resp) { - p = nfs2svc_encode_fattr(rqstp, p, &resp->fh); + p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat); *p++ = htonl(resp->access); return xdr_ressize_check(rqstp, p); } diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 1fc02dfdc5c4..401289913130 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -43,7 +43,6 @@ static __be32 nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, struct nfsd3_attrstat *resp) { - int err; __be32 nfserr; dprintk("nfsd: GETATTR(3) %s\n", @@ -55,9 +54,7 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, if (nfserr) RETURN_STATUS(nfserr); - err = vfs_getattr(resp->fh.fh_export->ex_path.mnt, - resp->fh.fh_dentry, &resp->stat); - nfserr = nfserrno(err); + nfserr = fh_getattr(&resp->fh, &resp->stat); RETURN_STATUS(nfserr); } diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 324c0baf7cda..14d9ecb96cff 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -11,6 +11,7 @@ #include "xdr3.h" #include "auth.h" #include "netns.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_XDR @@ -105,12 +106,14 @@ decode_sattr3(__be32 *p, struct iattr *iap) iap->ia_mode = ntohl(*p++); } if (*p++) { - iap->ia_valid |= ATTR_UID; - iap->ia_uid = ntohl(*p++); + iap->ia_uid = make_kuid(&init_user_ns, ntohl(*p++)); + if (uid_valid(iap->ia_uid)) + iap->ia_valid |= ATTR_UID; } if (*p++) { - iap->ia_valid |= ATTR_GID; - iap->ia_gid = ntohl(*p++); + iap->ia_gid = make_kgid(&init_user_ns, ntohl(*p++)); + if (gid_valid(iap->ia_gid)) + iap->ia_valid |= ATTR_GID; } if (*p++) { u64 newsize; @@ -167,8 +170,8 @@ encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, *p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]); *p++ = htonl((u32) stat->mode); *p++ = htonl((u32) stat->nlink); - *p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid)); - *p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid)); + *p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid)); + *p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid)); if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) { p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN); } else { @@ -204,10 +207,10 @@ encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) { struct dentry *dentry = fhp->fh_dentry; if (dentry && dentry->d_inode) { - int err; + __be32 err; struct kstat stat; - err = vfs_getattr(fhp->fh_export->ex_path.mnt, dentry, &stat); + err = fh_getattr(fhp, &stat); if (!err) { *p++ = xdr_one; /* attributes follow */ lease_get_mtime(dentry->d_inode, &stat.mtime); @@ -254,13 +257,12 @@ encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) */ void fill_post_wcc(struct svc_fh *fhp) { - int err; + __be32 err; if (fhp->fh_post_saved) printk("nfsd: inode locked twice during operation.\n"); - err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry, - &fhp->fh_post_attr); + err = fh_getattr(fhp, &fhp->fh_post_attr); fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version; if (err) { fhp->fh_post_saved = 0; diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 9c51aff02ae2..8a50b3c18093 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -264,7 +264,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl, ace->flag = eflag; ace->access_mask = deny_mask_from_posix(deny, flags); ace->whotype = NFS4_ACL_WHO_NAMED; - ace->who = pa->e_id; + ace->who_uid = pa->e_uid; ace++; acl->naces++; } @@ -273,7 +273,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl, ace->access_mask = mask_from_posix(pa->e_perm & pas.mask, flags); ace->whotype = NFS4_ACL_WHO_NAMED; - ace->who = pa->e_id; + ace->who_uid = pa->e_uid; ace++; acl->naces++; pa++; @@ -300,7 +300,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl, ace->access_mask = mask_from_posix(pa->e_perm & pas.mask, flags); ace->whotype = NFS4_ACL_WHO_NAMED; - ace->who = pa->e_id; + ace->who_gid = pa->e_gid; ace++; acl->naces++; pa++; @@ -329,7 +329,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl, ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP; ace->access_mask = deny_mask_from_posix(deny, flags); ace->whotype = NFS4_ACL_WHO_NAMED; - ace->who = pa->e_id; + ace->who_gid = pa->e_gid; ace++; acl->naces++; } @@ -345,6 +345,18 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl, acl->naces++; } +static bool +pace_gt(struct posix_acl_entry *pace1, struct posix_acl_entry *pace2) +{ + if (pace1->e_tag != pace2->e_tag) + return pace1->e_tag > pace2->e_tag; + if (pace1->e_tag == ACL_USER) + return uid_gt(pace1->e_uid, pace2->e_uid); + if (pace1->e_tag == ACL_GROUP) + return gid_gt(pace1->e_gid, pace2->e_gid); + return false; +} + static void sort_pacl_range(struct posix_acl *pacl, int start, int end) { int sorted = 0, i; @@ -355,8 +367,8 @@ sort_pacl_range(struct posix_acl *pacl, int start, int end) { while (!sorted) { sorted = 1; for (i = start; i < end; i++) { - if (pacl->a_entries[i].e_id - > pacl->a_entries[i+1].e_id) { + if (pace_gt(&pacl->a_entries[i], + &pacl->a_entries[i+1])) { sorted = 0; tmp = pacl->a_entries[i]; pacl->a_entries[i] = pacl->a_entries[i+1]; @@ -398,7 +410,10 @@ struct posix_ace_state { }; struct posix_user_ace_state { - uid_t uid; + union { + kuid_t uid; + kgid_t gid; + }; struct posix_ace_state perms; }; @@ -521,7 +536,6 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) if (error) goto out_err; low_mode_from_nfs4(state->owner.allow, &pace->e_perm, flags); - pace->e_id = ACL_UNDEFINED_ID; for (i=0; i < state->users->n; i++) { pace++; @@ -531,7 +545,7 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) goto out_err; low_mode_from_nfs4(state->users->aces[i].perms.allow, &pace->e_perm, flags); - pace->e_id = state->users->aces[i].uid; + pace->e_uid = state->users->aces[i].uid; add_to_mask(state, &state->users->aces[i].perms); } @@ -541,7 +555,6 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) if (error) goto out_err; low_mode_from_nfs4(state->group.allow, &pace->e_perm, flags); - pace->e_id = ACL_UNDEFINED_ID; add_to_mask(state, &state->group); for (i=0; i < state->groups->n; i++) { @@ -552,14 +565,13 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) goto out_err; low_mode_from_nfs4(state->groups->aces[i].perms.allow, &pace->e_perm, flags); - pace->e_id = state->groups->aces[i].uid; + pace->e_gid = state->groups->aces[i].gid; add_to_mask(state, &state->groups->aces[i].perms); } pace++; pace->e_tag = ACL_MASK; low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags); - pace->e_id = ACL_UNDEFINED_ID; pace++; pace->e_tag = ACL_OTHER; @@ -567,7 +579,6 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) if (error) goto out_err; low_mode_from_nfs4(state->other.allow, &pace->e_perm, flags); - pace->e_id = ACL_UNDEFINED_ID; return pacl; out_err: @@ -587,12 +598,13 @@ static inline void deny_bits(struct posix_ace_state *astate, u32 mask) astate->deny |= mask & ~astate->allow; } -static int find_uid(struct posix_acl_state *state, struct posix_ace_state_array *a, uid_t uid) +static int find_uid(struct posix_acl_state *state, kuid_t uid) { + struct posix_ace_state_array *a = state->users; int i; for (i = 0; i < a->n; i++) - if (a->aces[i].uid == uid) + if (uid_eq(a->aces[i].uid, uid)) return i; /* Not found: */ a->n++; @@ -603,6 +615,23 @@ static int find_uid(struct posix_acl_state *state, struct posix_ace_state_array return i; } +static int find_gid(struct posix_acl_state *state, kgid_t gid) +{ + struct posix_ace_state_array *a = state->groups; + int i; + + for (i = 0; i < a->n; i++) + if (gid_eq(a->aces[i].gid, gid)) + return i; + /* Not found: */ + a->n++; + a->aces[i].gid = gid; + a->aces[i].perms.allow = state->everyone.allow; + a->aces[i].perms.deny = state->everyone.deny; + + return i; +} + static void deny_bits_array(struct posix_ace_state_array *a, u32 mask) { int i; @@ -636,7 +665,7 @@ static void process_one_v4_ace(struct posix_acl_state *state, } break; case ACL_USER: - i = find_uid(state, state->users, ace->who); + i = find_uid(state, ace->who_uid); if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { allow_bits(&state->users->aces[i].perms, mask); } else { @@ -658,7 +687,7 @@ static void process_one_v4_ace(struct posix_acl_state *state, } break; case ACL_GROUP: - i = find_uid(state, state->groups, ace->who); + i = find_gid(state, ace->who_gid); if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { allow_bits(&state->groups->aces[i].perms, mask); } else { diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index a1f10c0a6255..4832fd819f88 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -65,7 +65,7 @@ MODULE_PARM_DESC(nfs4_disable_idmapping, struct ent { struct cache_head h; int type; /* User / Group */ - uid_t id; + u32 id; char name[IDMAP_NAMESZ]; char authname[IDMAP_NAMESZ]; }; @@ -140,12 +140,6 @@ idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, } static int -idtoname_upcall(struct cache_detail *cd, struct cache_head *ch) -{ - return sunrpc_cache_pipe_upcall(cd, ch, idtoname_request); -} - -static int idtoname_match(struct cache_head *ca, struct cache_head *cb) { struct ent *a = container_of(ca, struct ent, h); @@ -192,7 +186,7 @@ static struct cache_detail idtoname_cache_template = { .hash_size = ENT_HASHMAX, .name = "nfs4.idtoname", .cache_put = ent_put, - .cache_upcall = idtoname_upcall, + .cache_request = idtoname_request, .cache_parse = idtoname_parse, .cache_show = idtoname_show, .warn_no_listener = warn_no_idmapd, @@ -321,12 +315,6 @@ nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, } static int -nametoid_upcall(struct cache_detail *cd, struct cache_head *ch) -{ - return sunrpc_cache_pipe_upcall(cd, ch, nametoid_request); -} - -static int nametoid_match(struct cache_head *ca, struct cache_head *cb) { struct ent *a = container_of(ca, struct ent, h); @@ -365,7 +353,7 @@ static struct cache_detail nametoid_cache_template = { .hash_size = ENT_HASHMAX, .name = "nfs4.nametoid", .cache_put = ent_put, - .cache_upcall = nametoid_upcall, + .cache_request = nametoid_request, .cache_parse = nametoid_parse, .cache_show = nametoid_show, .warn_no_listener = warn_no_idmapd, @@ -540,7 +528,7 @@ rqst_authname(struct svc_rqst *rqstp) static __be32 idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, - uid_t *id) + u32 *id) { struct ent *item, key = { .type = type, @@ -564,7 +552,7 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen } static int -idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name) +idmap_id_to_name(struct svc_rqst *rqstp, int type, u32 id, char *name) { struct ent *item, key = { .id = id, @@ -587,7 +575,7 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name) } static bool -numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, uid_t *id) +numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u32 *id) { int ret; char buf[11]; @@ -603,7 +591,7 @@ numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namel } static __be32 -do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, uid_t *id) +do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u32 *id) { if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS) if (numeric_name_to_id(rqstp, type, name, namelen, id)) @@ -616,7 +604,7 @@ do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u } static int -do_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name) +do_id_to_name(struct svc_rqst *rqstp, int type, u32 id, char *name) { if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS) return sprintf(name, "%u", id); @@ -625,26 +613,40 @@ do_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name) __be32 nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen, - __u32 *id) + kuid_t *uid) { - return do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id); + __be32 status; + u32 id = -1; + status = do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, &id); + *uid = make_kuid(&init_user_ns, id); + if (!uid_valid(*uid)) + status = nfserr_badowner; + return status; } __be32 nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, - __u32 *id) + kgid_t *gid) { - return do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, id); + __be32 status; + u32 id = -1; + status = do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, &id); + *gid = make_kgid(&init_user_ns, id); + if (!gid_valid(*gid)) + status = nfserr_badowner; + return status; } int -nfsd_map_uid_to_name(struct svc_rqst *rqstp, __u32 id, char *name) +nfsd_map_uid_to_name(struct svc_rqst *rqstp, kuid_t uid, char *name) { + u32 id = from_kuid(&init_user_ns, uid); return do_id_to_name(rqstp, IDMAP_TYPE_USER, id, name); } int -nfsd_map_gid_to_name(struct svc_rqst *rqstp, __u32 id, char *name) +nfsd_map_gid_to_name(struct svc_rqst *rqstp, kgid_t gid, char *name) { + u32 id = from_kgid(&init_user_ns, gid); return do_id_to_name(rqstp, IDMAP_TYPE_GROUP, id, name); } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 9d1c5dba2bbb..ae73175e6e68 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -993,14 +993,15 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!buf) return nfserr_jukebox; + p = buf; status = nfsd4_encode_fattr(&cstate->current_fh, cstate->current_fh.fh_export, - cstate->current_fh.fh_dentry, buf, - &count, verify->ve_bmval, + cstate->current_fh.fh_dentry, &p, + count, verify->ve_bmval, rqstp, 0); /* this means that nfsd4_encode_fattr() ran out of space */ - if (status == nfserr_resource && count == 0) + if (status == nfserr_resource) status = nfserr_not_same; if (status) goto out_kfree; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index ba6fdd4a0455..899ca26dd194 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -73,8 +73,8 @@ nfs4_save_creds(const struct cred **original_creds) if (!new) return -ENOMEM; - new->fsuid = 0; - new->fsgid = 0; + new->fsuid = GLOBAL_ROOT_UID; + new->fsgid = GLOBAL_ROOT_GID; *original_creds = override_creds(new); put_cred(new); return 0; @@ -1185,6 +1185,12 @@ bin_to_hex_dup(const unsigned char *src, int srclen) static int nfsd4_umh_cltrack_init(struct net __attribute__((unused)) *net) { + /* XXX: The usermode helper s not working in container yet. */ + if (net != &init_net) { + WARN(1, KERN_ERR "NFSD: attempt to initialize umh client " + "tracking in a container!\n"); + return -EINVAL; + } return nfsd4_umh_cltrack_upcall("init", NULL, NULL); } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ac8ed96c4199..2e27430b9070 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -40,7 +40,7 @@ #include <linux/pagemap.h> #include <linux/ratelimit.h> #include <linux/sunrpc/svcauth_gss.h> -#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> #include "xdr4.h" #include "vfs.h" #include "current_stateid.h" @@ -151,7 +151,7 @@ get_nfs4_file(struct nfs4_file *fi) } static int num_delegations; -unsigned int max_delegations; +unsigned long max_delegations; /* * Open owner state (share locks) @@ -230,21 +230,28 @@ static void nfs4_file_put_access(struct nfs4_file *fp, int oflag) __nfs4_file_put_access(fp, oflag); } -static inline int get_new_stid(struct nfs4_stid *stid) +static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct +kmem_cache *slab) { + struct idr *stateids = &cl->cl_stateids; static int min_stateid = 0; - struct idr *stateids = &stid->sc_client->cl_stateids; - int new_stid; - int error; + struct nfs4_stid *stid; + int new_id; + + stid = kmem_cache_alloc(slab, GFP_KERNEL); + if (!stid) + return NULL; + + new_id = idr_alloc(stateids, stid, min_stateid, 0, GFP_KERNEL); + if (new_id < 0) + goto out_free; + stid->sc_client = cl; + stid->sc_type = 0; + stid->sc_stateid.si_opaque.so_id = new_id; + stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid; + /* Will be incremented before return to client: */ + stid->sc_stateid.si_generation = 0; - error = idr_get_new_above(stateids, stid, min_stateid, &new_stid); - /* - * Note: the necessary preallocation was done in - * nfs4_alloc_stateid(). The idr code caps the number of - * preallocations that can exist at a time, but the state lock - * prevents anyone from using ours before we get here: - */ - WARN_ON_ONCE(error); /* * It shouldn't be a problem to reuse an opaque stateid value. * I don't think it is for 4.1. But with 4.0 I worry that, for @@ -255,39 +262,13 @@ static inline int get_new_stid(struct nfs4_stid *stid) * "increase" (mod INT_MAX): */ - min_stateid = new_stid+1; + min_stateid = new_id+1; if (min_stateid == INT_MAX) min_stateid = 0; - return new_stid; -} - -static void init_stid(struct nfs4_stid *stid, struct nfs4_client *cl, unsigned char type) -{ - stateid_t *s = &stid->sc_stateid; - int new_id; - - stid->sc_type = type; - stid->sc_client = cl; - s->si_opaque.so_clid = cl->cl_clientid; - new_id = get_new_stid(stid); - s->si_opaque.so_id = (u32)new_id; - /* Will be incremented before return to client: */ - s->si_generation = 0; -} - -static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab) -{ - struct idr *stateids = &cl->cl_stateids; - - if (!idr_pre_get(stateids, GFP_KERNEL)) - return NULL; - /* - * Note: if we fail here (or any time between now and the time - * we actually get the new idr), we won't need to undo the idr - * preallocation, since the idr code caps the number of - * preallocated entries. - */ - return kmem_cache_alloc(slab, GFP_KERNEL); + return stid; +out_free: + kfree(stid); + return NULL; } static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp) @@ -316,7 +297,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab)); if (dp == NULL) return dp; - init_stid(&dp->dl_stid, clp, NFS4_DELEG_STID); + dp->dl_stid.sc_type = NFS4_DELEG_STID; /* * delegation seqid's are never incremented. The 4.1 special * meaning of seqid 0 isn't meaningful, really, but let's avoid @@ -337,13 +318,21 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv return dp; } +static void free_stid(struct nfs4_stid *s, struct kmem_cache *slab) +{ + struct idr *stateids = &s->sc_client->cl_stateids; + + idr_remove(stateids, s->sc_stateid.si_opaque.so_id); + kmem_cache_free(slab, s); +} + void nfs4_put_delegation(struct nfs4_delegation *dp) { if (atomic_dec_and_test(&dp->dl_count)) { dprintk("NFSD: freeing dp %p\n",dp); put_nfs4_file(dp->dl_file); - kmem_cache_free(deleg_slab, dp); + free_stid(&dp->dl_stid, deleg_slab); num_delegations--; } } @@ -360,9 +349,7 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp) static void unhash_stid(struct nfs4_stid *s) { - struct idr *stateids = &s->sc_client->cl_stateids; - - idr_remove(stateids, s->sc_stateid.si_opaque.so_id); + s->sc_type = 0; } /* Called under the state lock. */ @@ -519,7 +506,7 @@ static void close_generic_stateid(struct nfs4_ol_stateid *stp) static void free_generic_stateid(struct nfs4_ol_stateid *stp) { - kmem_cache_free(stateid_slab, stp); + free_stid(&stp->st_stid, stateid_slab); } static void release_lock_stateid(struct nfs4_ol_stateid *stp) @@ -700,8 +687,8 @@ static int nfsd4_get_drc_mem(int slotsize, u32 num) num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION); spin_lock(&nfsd_drc_lock); - avail = min_t(int, NFSD_MAX_MEM_PER_SESSION, - nfsd_drc_max_mem - nfsd_drc_mem_used); + avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, + nfsd_drc_max_mem - nfsd_drc_mem_used); num = min_t(int, num, avail / slotsize); nfsd_drc_mem_used += num * slotsize; spin_unlock(&nfsd_drc_lock); @@ -905,7 +892,7 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan, new = __alloc_session(slotsize, numslots); if (!new) { - nfsd4_put_drc_mem(slotsize, fchan->maxreqs); + nfsd4_put_drc_mem(slotsize, numslots); return NULL; } init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize, nn); @@ -1048,7 +1035,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) static inline void free_client(struct nfs4_client *clp) { - struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct nfsd_net __maybe_unused *nn = net_generic(clp->net, nfsd_net_id); lockdep_assert_held(&nn->client_lock); while (!list_empty(&clp->cl_sessions)) { @@ -1060,6 +1047,7 @@ free_client(struct nfs4_client *clp) } free_svc_cred(&clp->cl_cred); kfree(clp->cl_name.data); + idr_destroy(&clp->cl_stateids); kfree(clp); } @@ -1202,7 +1190,7 @@ static bool groups_equal(struct group_info *g1, struct group_info *g2) if (g1->ngroups != g2->ngroups) return false; for (i=0; i<g1->ngroups; i++) - if (GROUP_AT(g1, i) != GROUP_AT(g2, i)) + if (!gid_eq(GROUP_AT(g1, i), GROUP_AT(g2, i))) return false; return true; } @@ -1227,8 +1215,8 @@ static bool same_creds(struct svc_cred *cr1, struct svc_cred *cr2) { if ((is_gss_cred(cr1) != is_gss_cred(cr2)) - || (cr1->cr_uid != cr2->cr_uid) - || (cr1->cr_gid != cr2->cr_gid) + || (!uid_eq(cr1->cr_uid, cr2->cr_uid)) + || (!gid_eq(cr1->cr_gid, cr2->cr_gid)) || !groups_equal(cr1->cr_group_info, cr2->cr_group_info)) return false; if (cr1->cr_principal == cr2->cr_principal) @@ -1258,7 +1246,12 @@ static void gen_confirm(struct nfs4_client *clp) static struct nfs4_stid *find_stateid(struct nfs4_client *cl, stateid_t *t) { - return idr_find(&cl->cl_stateids, t->si_opaque.so_id); + struct nfs4_stid *ret; + + ret = idr_find(&cl->cl_stateids, t->si_opaque.so_id); + if (!ret || !ret->sc_type) + return NULL; + return ret; } static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask) @@ -1844,11 +1837,12 @@ nfsd4_create_session(struct svc_rqst *rqstp, /* cache solo and embedded create sessions under the state lock */ nfsd4_cache_create_session(cr_ses, cs_slot, status); -out: nfs4_unlock_state(); +out: dprintk("%s returns %d\n", __func__, ntohl(status)); return status; out_free_conn: + nfs4_unlock_state(); free_conn(conn); out_free_session: __free_session(new); @@ -2443,9 +2437,8 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { struct nfs4_openowner *oo = open->op_openowner; - struct nfs4_client *clp = oo->oo_owner.so_client; - init_stid(&stp->st_stid, clp, NFS4_OPEN_STID); + stp->st_stid.sc_type = NFS4_OPEN_STID; INIT_LIST_HEAD(&stp->st_lockowners); list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); @@ -4031,7 +4024,7 @@ alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct stp = nfs4_alloc_stateid(clp); if (stp == NULL) return NULL; - init_stid(&stp->st_stid, clp, NFS4_LOCK_STID); + stp->st_stid.sc_type = NFS4_LOCK_STID; list_add(&stp->st_perfile, &fp->fi_stateids); list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); stp->st_stateowner = &lo->lo_owner; @@ -4913,16 +4906,6 @@ nfs4_state_start_net(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); int ret; - /* - * FIXME: For now, we hang most of the pernet global stuff off of - * init_net until nfsd is fully containerized. Eventually, we'll - * need to pass a net pointer into this function, take a reference - * to that instead and then do most of the rest of this on a per-net - * basis. - */ - if (net != &init_net) - return -EINVAL; - ret = nfs4_state_create_net(net); if (ret) return ret; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 0dc11586682f..01168865dd37 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -293,13 +293,13 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, ace->whotype = nfs4_acl_get_whotype(buf, dummy32); status = nfs_ok; if (ace->whotype != NFS4_ACL_WHO_NAMED) - ace->who = 0; + ; else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) status = nfsd_map_name_to_gid(argp->rqstp, - buf, dummy32, &ace->who); + buf, dummy32, &ace->who_gid); else status = nfsd_map_name_to_uid(argp->rqstp, - buf, dummy32, &ace->who); + buf, dummy32, &ace->who_uid); if (status) return status; } @@ -464,9 +464,16 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_ READ32(dummy); READ_BUF(dummy * 4); if (cbs->flavor == (u32)(-1)) { - cbs->uid = uid; - cbs->gid = gid; - cbs->flavor = RPC_AUTH_UNIX; + kuid_t kuid = make_kuid(&init_user_ns, uid); + kgid_t kgid = make_kgid(&init_user_ns, gid); + if (uid_valid(kuid) && gid_valid(kgid)) { + cbs->uid = kuid; + cbs->gid = kgid; + cbs->flavor = RPC_AUTH_UNIX; + } else { + dprintk("RPC_AUTH_UNIX with invalid" + "uid or gid ignoring!\n"); + } } break; case RPC_AUTH_GSS: @@ -1926,7 +1933,7 @@ static u32 nfs4_file_type(umode_t mode) } static __be32 -nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group, +nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, kuid_t uid, kgid_t gid, __be32 **p, int *buflen) { int status; @@ -1935,10 +1942,10 @@ nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group, return nfserr_resource; if (whotype != NFS4_ACL_WHO_NAMED) status = nfs4_acl_write_who(whotype, (u8 *)(*p + 1)); - else if (group) - status = nfsd_map_gid_to_name(rqstp, id, (u8 *)(*p + 1)); + else if (gid_valid(gid)) + status = nfsd_map_gid_to_name(rqstp, gid, (u8 *)(*p + 1)); else - status = nfsd_map_uid_to_name(rqstp, id, (u8 *)(*p + 1)); + status = nfsd_map_uid_to_name(rqstp, uid, (u8 *)(*p + 1)); if (status < 0) return nfserrno(status); *p = xdr_encode_opaque(*p, NULL, status); @@ -1948,22 +1955,33 @@ nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group, } static inline __be32 -nfsd4_encode_user(struct svc_rqst *rqstp, uid_t uid, __be32 **p, int *buflen) +nfsd4_encode_user(struct svc_rqst *rqstp, kuid_t user, __be32 **p, int *buflen) { - return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, uid, 0, p, buflen); + return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, user, INVALID_GID, + p, buflen); } static inline __be32 -nfsd4_encode_group(struct svc_rqst *rqstp, uid_t gid, __be32 **p, int *buflen) +nfsd4_encode_group(struct svc_rqst *rqstp, kgid_t group, __be32 **p, int *buflen) { - return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, gid, 1, p, buflen); + return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, INVALID_UID, group, + p, buflen); } static inline __be32 -nfsd4_encode_aclname(struct svc_rqst *rqstp, int whotype, uid_t id, int group, +nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace, __be32 **p, int *buflen) { - return nfsd4_encode_name(rqstp, whotype, id, group, p, buflen); + kuid_t uid = INVALID_UID; + kgid_t gid = INVALID_GID; + + if (ace->whotype == NFS4_ACL_WHO_NAMED) { + if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) + gid = ace->who_gid; + else + uid = ace->who_uid; + } + return nfsd4_encode_name(rqstp, ace->whotype, uid, gid, p, buflen); } #define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \ @@ -1997,7 +2015,7 @@ static int get_parent_attributes(struct svc_export *exp, struct kstat *stat) if (path.dentry != path.mnt->mnt_root) break; } - err = vfs_getattr(path.mnt, path.dentry, stat); + err = vfs_getattr(&path, stat); path_put(&path); return err; } @@ -2006,12 +2024,11 @@ static int get_parent_attributes(struct svc_export *exp, struct kstat *stat) * Note: @fhp can be NULL; in this case, we might have to compose the filehandle * ourselves. * - * @countp is the buffer size in _words_; upon successful return this becomes - * replaced with the number of words written. + * countp is the buffer size in _words_ */ __be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, - struct dentry *dentry, __be32 *buffer, int *countp, u32 *bmval, + struct dentry *dentry, __be32 **buffer, int count, u32 *bmval, struct svc_rqst *rqstp, int ignore_crossmnt) { u32 bmval0 = bmval[0]; @@ -2020,12 +2037,12 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, struct kstat stat; struct svc_fh tempfh; struct kstatfs statfs; - int buflen = *countp << 2; + int buflen = count << 2; __be32 *attrlenp; u32 dummy; u64 dummy64; u32 rdattr_err = 0; - __be32 *p = buffer; + __be32 *p = *buffer; __be32 status; int err; int aclsupport = 0; @@ -2050,7 +2067,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, goto out; } - err = vfs_getattr(exp->ex_path.mnt, dentry, &stat); + err = vfs_getattr(&path, &stat); if (err) goto out_nfserr; if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | @@ -2224,9 +2241,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, WRITE32(ace->type); WRITE32(ace->flag); WRITE32(ace->access_mask & NFS4_ACE_MASK_ALL); - status = nfsd4_encode_aclname(rqstp, ace->whotype, - ace->who, ace->flag & NFS4_ACE_IDENTIFIER_GROUP, - &p, &buflen); + status = nfsd4_encode_aclname(rqstp, ace, &p, &buflen); if (status == nfserr_resource) goto out_resource; if (status) @@ -2431,7 +2446,7 @@ out_acl: } *attrlenp = htonl((char *)p - (char *)attrlenp - 4); - *countp = p - buffer; + *buffer = p; status = nfs_ok; out: @@ -2443,7 +2458,6 @@ out_nfserr: status = nfserrno(err); goto out; out_resource: - *countp = 0; status = nfserr_resource; goto out; out_serverfault: @@ -2462,7 +2476,7 @@ static inline int attributes_need_mount(u32 *bmval) static __be32 nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd, - const char *name, int namlen, __be32 *p, int *buflen) + const char *name, int namlen, __be32 **p, int buflen) { struct svc_export *exp = cd->rd_fhp->fh_export; struct dentry *dentry; @@ -2568,10 +2582,9 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */ p = xdr_encode_array(p, name, namlen); /* name length & name */ - nfserr = nfsd4_encode_dirent_fattr(cd, name, namlen, p, &buflen); + nfserr = nfsd4_encode_dirent_fattr(cd, name, namlen, &p, buflen); switch (nfserr) { case nfs_ok: - p += buflen; break; case nfserr_resource: nfserr = nfserr_toosmall; @@ -2698,10 +2711,8 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 buflen = resp->end - resp->p - (COMPOUND_ERR_SLACK_SPACE >> 2); nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry, - resp->p, &buflen, getattr->ga_bmval, + &resp->p, buflen, getattr->ga_bmval, resp->rqstp, 0); - if (!nfserr) - resp->p += buflen; return nfserr; } diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 2cbac34a55da..ca05f6dc3544 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -9,22 +9,22 @@ */ #include <linux/slab.h> +#include <linux/sunrpc/addr.h> +#include <linux/highmem.h> +#include <net/checksum.h> #include "nfsd.h" #include "cache.h" -/* Size of reply cache. Common values are: - * 4.3BSD: 128 - * 4.4BSD: 256 - * Solaris2: 1024 - * DEC Unix: 512-4096 - */ -#define CACHESIZE 1024 +#define NFSDDBG_FACILITY NFSDDBG_REPCACHE + #define HASHSIZE 64 static struct hlist_head * cache_hash; static struct list_head lru_head; -static int cache_disabled = 1; +static struct kmem_cache *drc_slab; +static unsigned int num_drc_entries; +static unsigned int max_drc_entries; /* * Calculate the hash index from an XID. @@ -37,6 +37,14 @@ static inline u32 request_hash(u32 xid) } static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec); +static void cache_cleaner_func(struct work_struct *unused); +static int nfsd_reply_cache_shrink(struct shrinker *shrink, + struct shrink_control *sc); + +struct shrinker nfsd_reply_cache_shrinker = { + .shrink = nfsd_reply_cache_shrink, + .seeks = 1, +}; /* * locking for the reply cache: @@ -44,30 +52,87 @@ static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec); * Otherwise, it when accessing _prev or _next, the lock must be held. */ static DEFINE_SPINLOCK(cache_lock); +static DECLARE_DELAYED_WORK(cache_cleaner, cache_cleaner_func); -int nfsd_reply_cache_init(void) +/* + * Put a cap on the size of the DRC based on the amount of available + * low memory in the machine. + * + * 64MB: 8192 + * 128MB: 11585 + * 256MB: 16384 + * 512MB: 23170 + * 1GB: 32768 + * 2GB: 46340 + * 4GB: 65536 + * 8GB: 92681 + * 16GB: 131072 + * + * ...with a hard cap of 256k entries. In the worst case, each entry will be + * ~1k, so the above numbers should give a rough max of the amount of memory + * used in k. + */ +static unsigned int +nfsd_cache_size_limit(void) +{ + unsigned int limit; + unsigned long low_pages = totalram_pages - totalhigh_pages; + + limit = (16 * int_sqrt(low_pages)) << (PAGE_SHIFT-10); + return min_t(unsigned int, limit, 256*1024); +} + +static struct svc_cacherep * +nfsd_reply_cache_alloc(void) { struct svc_cacherep *rp; - int i; - INIT_LIST_HEAD(&lru_head); - i = CACHESIZE; - while (i) { - rp = kmalloc(sizeof(*rp), GFP_KERNEL); - if (!rp) - goto out_nomem; - list_add(&rp->c_lru, &lru_head); + rp = kmem_cache_alloc(drc_slab, GFP_KERNEL); + if (rp) { rp->c_state = RC_UNUSED; rp->c_type = RC_NOCACHE; + INIT_LIST_HEAD(&rp->c_lru); INIT_HLIST_NODE(&rp->c_hash); - i--; } + return rp; +} + +static void +nfsd_reply_cache_free_locked(struct svc_cacherep *rp) +{ + if (rp->c_type == RC_REPLBUFF) + kfree(rp->c_replvec.iov_base); + if (!hlist_unhashed(&rp->c_hash)) + hlist_del(&rp->c_hash); + list_del(&rp->c_lru); + --num_drc_entries; + kmem_cache_free(drc_slab, rp); +} + +static void +nfsd_reply_cache_free(struct svc_cacherep *rp) +{ + spin_lock(&cache_lock); + nfsd_reply_cache_free_locked(rp); + spin_unlock(&cache_lock); +} + +int nfsd_reply_cache_init(void) +{ + INIT_LIST_HEAD(&lru_head); + max_drc_entries = nfsd_cache_size_limit(); + num_drc_entries = 0; + + register_shrinker(&nfsd_reply_cache_shrinker); + drc_slab = kmem_cache_create("nfsd_drc", sizeof(struct svc_cacherep), + 0, 0, NULL); + if (!drc_slab) + goto out_nomem; - cache_hash = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL); + cache_hash = kcalloc(HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL); if (!cache_hash) goto out_nomem; - cache_disabled = 0; return 0; out_nomem: printk(KERN_ERR "nfsd: failed to allocate reply cache\n"); @@ -79,27 +144,33 @@ void nfsd_reply_cache_shutdown(void) { struct svc_cacherep *rp; + unregister_shrinker(&nfsd_reply_cache_shrinker); + cancel_delayed_work_sync(&cache_cleaner); + while (!list_empty(&lru_head)) { rp = list_entry(lru_head.next, struct svc_cacherep, c_lru); - if (rp->c_state == RC_DONE && rp->c_type == RC_REPLBUFF) - kfree(rp->c_replvec.iov_base); - list_del(&rp->c_lru); - kfree(rp); + nfsd_reply_cache_free_locked(rp); } - cache_disabled = 1; - kfree (cache_hash); cache_hash = NULL; + + if (drc_slab) { + kmem_cache_destroy(drc_slab); + drc_slab = NULL; + } } /* - * Move cache entry to end of LRU list + * Move cache entry to end of LRU list, and queue the cleaner to run if it's + * not already scheduled. */ static void lru_put_end(struct svc_cacherep *rp) { + rp->c_timestamp = jiffies; list_move_tail(&rp->c_lru, &lru_head); + schedule_delayed_work(&cache_cleaner, RC_EXPIRE); } /* @@ -112,83 +183,214 @@ hash_refile(struct svc_cacherep *rp) hlist_add_head(&rp->c_hash, cache_hash + request_hash(rp->c_xid)); } +static inline bool +nfsd_cache_entry_expired(struct svc_cacherep *rp) +{ + return rp->c_state != RC_INPROG && + time_after(jiffies, rp->c_timestamp + RC_EXPIRE); +} + +/* + * Walk the LRU list and prune off entries that are older than RC_EXPIRE. + * Also prune the oldest ones when the total exceeds the max number of entries. + */ +static void +prune_cache_entries(void) +{ + struct svc_cacherep *rp, *tmp; + + list_for_each_entry_safe(rp, tmp, &lru_head, c_lru) { + if (!nfsd_cache_entry_expired(rp) && + num_drc_entries <= max_drc_entries) + break; + nfsd_reply_cache_free_locked(rp); + } + + /* + * Conditionally rearm the job. If we cleaned out the list, then + * cancel any pending run (since there won't be any work to do). + * Otherwise, we rearm the job or modify the existing one to run in + * RC_EXPIRE since we just ran the pruner. + */ + if (list_empty(&lru_head)) + cancel_delayed_work(&cache_cleaner); + else + mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE); +} + +static void +cache_cleaner_func(struct work_struct *unused) +{ + spin_lock(&cache_lock); + prune_cache_entries(); + spin_unlock(&cache_lock); +} + +static int +nfsd_reply_cache_shrink(struct shrinker *shrink, struct shrink_control *sc) +{ + unsigned int num; + + spin_lock(&cache_lock); + if (sc->nr_to_scan) + prune_cache_entries(); + num = num_drc_entries; + spin_unlock(&cache_lock); + + return num; +} + +/* + * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes + */ +static __wsum +nfsd_cache_csum(struct svc_rqst *rqstp) +{ + int idx; + unsigned int base; + __wsum csum; + struct xdr_buf *buf = &rqstp->rq_arg; + const unsigned char *p = buf->head[0].iov_base; + size_t csum_len = min_t(size_t, buf->head[0].iov_len + buf->page_len, + RC_CSUMLEN); + size_t len = min(buf->head[0].iov_len, csum_len); + + /* rq_arg.head first */ + csum = csum_partial(p, len, 0); + csum_len -= len; + + /* Continue into page array */ + idx = buf->page_base / PAGE_SIZE; + base = buf->page_base & ~PAGE_MASK; + while (csum_len) { + p = page_address(buf->pages[idx]) + base; + len = min_t(size_t, PAGE_SIZE - base, csum_len); + csum = csum_partial(p, len, csum); + csum_len -= len; + base = 0; + ++idx; + } + return csum; +} + +/* + * Search the request hash for an entry that matches the given rqstp. + * Must be called with cache_lock held. Returns the found entry or + * NULL on failure. + */ +static struct svc_cacherep * +nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum) +{ + struct svc_cacherep *rp; + struct hlist_head *rh; + __be32 xid = rqstp->rq_xid; + u32 proto = rqstp->rq_prot, + vers = rqstp->rq_vers, + proc = rqstp->rq_proc; + + rh = &cache_hash[request_hash(xid)]; + hlist_for_each_entry(rp, rh, c_hash) { + if (xid == rp->c_xid && proc == rp->c_proc && + proto == rp->c_prot && vers == rp->c_vers && + rqstp->rq_arg.len == rp->c_len && csum == rp->c_csum && + rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) && + rpc_get_port(svc_addr(rqstp)) == rpc_get_port((struct sockaddr *)&rp->c_addr)) + return rp; + } + return NULL; +} + /* * Try to find an entry matching the current call in the cache. When none - * is found, we grab the oldest unlocked entry off the LRU list. - * Note that no operation within the loop may sleep. + * is found, we try to grab the oldest expired entry off the LRU list. If + * a suitable one isn't there, then drop the cache_lock and allocate a + * new one, then search again in case one got inserted while this thread + * didn't hold the lock. */ int nfsd_cache_lookup(struct svc_rqst *rqstp) { - struct hlist_node *hn; - struct hlist_head *rh; - struct svc_cacherep *rp; + struct svc_cacherep *rp, *found; __be32 xid = rqstp->rq_xid; u32 proto = rqstp->rq_prot, vers = rqstp->rq_vers, proc = rqstp->rq_proc; + __wsum csum; unsigned long age; int type = rqstp->rq_cachetype; int rtn; rqstp->rq_cacherep = NULL; - if (cache_disabled || type == RC_NOCACHE) { + if (type == RC_NOCACHE) { nfsdstats.rcnocache++; return RC_DOIT; } + csum = nfsd_cache_csum(rqstp); + spin_lock(&cache_lock); rtn = RC_DOIT; - rh = &cache_hash[request_hash(xid)]; - hlist_for_each_entry(rp, hn, rh, c_hash) { - if (rp->c_state != RC_UNUSED && - xid == rp->c_xid && proc == rp->c_proc && - proto == rp->c_prot && vers == rp->c_vers && - time_before(jiffies, rp->c_timestamp + 120*HZ) && - memcmp((char*)&rqstp->rq_addr, (char*)&rp->c_addr, sizeof(rp->c_addr))==0) { - nfsdstats.rchits++; - goto found_entry; + rp = nfsd_cache_search(rqstp, csum); + if (rp) + goto found_entry; + + /* Try to use the first entry on the LRU */ + if (!list_empty(&lru_head)) { + rp = list_first_entry(&lru_head, struct svc_cacherep, c_lru); + if (nfsd_cache_entry_expired(rp) || + num_drc_entries >= max_drc_entries) { + lru_put_end(rp); + prune_cache_entries(); + goto setup_entry; } } - nfsdstats.rcmisses++; - /* This loop shouldn't take more than a few iterations normally */ - { - int safe = 0; - list_for_each_entry(rp, &lru_head, c_lru) { - if (rp->c_state != RC_INPROG) - break; - if (safe++ > CACHESIZE) { - printk("nfsd: loop in repcache LRU list\n"); - cache_disabled = 1; - goto out; - } + /* Drop the lock and allocate a new entry */ + spin_unlock(&cache_lock); + rp = nfsd_reply_cache_alloc(); + if (!rp) { + dprintk("nfsd: unable to allocate DRC entry!\n"); + return RC_DOIT; } + spin_lock(&cache_lock); + ++num_drc_entries; + + /* + * Must search again just in case someone inserted one + * after we dropped the lock above. + */ + found = nfsd_cache_search(rqstp, csum); + if (found) { + nfsd_reply_cache_free_locked(rp); + rp = found; + goto found_entry; } - /* All entries on the LRU are in-progress. This should not happen */ - if (&rp->c_lru == &lru_head) { - static int complaints; - - printk(KERN_WARNING "nfsd: all repcache entries locked!\n"); - if (++complaints > 5) { - printk(KERN_WARNING "nfsd: disabling repcache.\n"); - cache_disabled = 1; - } - goto out; - } + /* + * We're keeping the one we just allocated. Are we now over the + * limit? Prune one off the tip of the LRU in trade for the one we + * just allocated if so. + */ + if (num_drc_entries >= max_drc_entries) + nfsd_reply_cache_free_locked(list_first_entry(&lru_head, + struct svc_cacherep, c_lru)); +setup_entry: + nfsdstats.rcmisses++; rqstp->rq_cacherep = rp; rp->c_state = RC_INPROG; rp->c_xid = xid; rp->c_proc = proc; - memcpy(&rp->c_addr, svc_addr_in(rqstp), sizeof(rp->c_addr)); + rpc_copy_addr((struct sockaddr *)&rp->c_addr, svc_addr(rqstp)); + rpc_set_port((struct sockaddr *)&rp->c_addr, rpc_get_port(svc_addr(rqstp))); rp->c_prot = proto; rp->c_vers = vers; - rp->c_timestamp = jiffies; + rp->c_len = rqstp->rq_arg.len; + rp->c_csum = csum; hash_refile(rp); + lru_put_end(rp); /* release any buffer */ if (rp->c_type == RC_REPLBUFF) { @@ -201,9 +403,9 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) return rtn; found_entry: + nfsdstats.rchits++; /* We found a matching entry which is either in progress or done. */ age = jiffies - rp->c_timestamp; - rp->c_timestamp = jiffies; lru_put_end(rp); rtn = RC_DROPIT; @@ -232,7 +434,7 @@ found_entry: break; default: printk(KERN_WARNING "nfsd: bad repcache type %d\n", rp->c_type); - rp->c_state = RC_UNUSED; + nfsd_reply_cache_free_locked(rp); } goto out; @@ -257,11 +459,11 @@ found_entry: void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) { - struct svc_cacherep *rp; + struct svc_cacherep *rp = rqstp->rq_cacherep; struct kvec *resv = &rqstp->rq_res.head[0], *cachv; int len; - if (!(rp = rqstp->rq_cacherep) || cache_disabled) + if (!rp) return; len = resv->iov_len - ((char*)statp - (char*)resv->iov_base); @@ -269,7 +471,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) /* Don't cache excessive amounts of data and XDR failures */ if (!statp || len > (256 >> 2)) { - rp->c_state = RC_UNUSED; + nfsd_reply_cache_free(rp); return; } @@ -283,21 +485,21 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) cachv = &rp->c_replvec; cachv->iov_base = kmalloc(len << 2, GFP_KERNEL); if (!cachv->iov_base) { - spin_lock(&cache_lock); - rp->c_state = RC_UNUSED; - spin_unlock(&cache_lock); + nfsd_reply_cache_free(rp); return; } cachv->iov_len = len << 2; memcpy(cachv->iov_base, statp, len << 2); break; + case RC_NOCACHE: + nfsd_reply_cache_free(rp); + return; } spin_lock(&cache_lock); lru_put_end(rp); rp->c_secure = rqstp->rq_secure; rp->c_type = cachetype; rp->c_state = RC_DONE; - rp->c_timestamp = jiffies; spin_unlock(&cache_lock); return; } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 74934284d9a7..f33455b4d957 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -10,7 +10,7 @@ #include <linux/sunrpc/svcsock.h> #include <linux/lockd/lockd.h> -#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/gss_api.h> #include <linux/sunrpc/gss_krb5_enctypes.h> #include <linux/sunrpc/rpc_pipe_fs.h> @@ -85,7 +85,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = { static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos) { - ino_t ino = file->f_path.dentry->d_inode->i_ino; + ino_t ino = file_inode(file)->i_ino; char *data; ssize_t rv; @@ -125,11 +125,11 @@ static const struct file_operations transaction_ops = { .llseek = default_llseek, }; -static int exports_open(struct inode *inode, struct file *file) +static int exports_net_open(struct net *net, struct file *file) { int err; struct seq_file *seq; - struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); err = seq_open(file, &nfs_exports_op); if (err) @@ -140,8 +140,26 @@ static int exports_open(struct inode *inode, struct file *file) return 0; } -static const struct file_operations exports_operations = { - .open = exports_open, +static int exports_proc_open(struct inode *inode, struct file *file) +{ + return exports_net_open(current->nsproxy->net_ns, file); +} + +static const struct file_operations exports_proc_operations = { + .open = exports_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .owner = THIS_MODULE, +}; + +static int exports_nfsd_open(struct inode *inode, struct file *file) +{ + return exports_net_open(inode->i_sb->s_fs_info, file); +} + +static const struct file_operations exports_nfsd_operations = { + .open = exports_nfsd_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, @@ -220,6 +238,7 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) struct sockaddr *sap = (struct sockaddr *)&address; size_t salen = sizeof(address); char *fo_path; + struct net *net = file->f_dentry->d_sb->s_fs_info; /* sanity check */ if (size == 0) @@ -232,7 +251,7 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) if (qword_get(&buf, fo_path, size) < 0) return -EINVAL; - if (rpc_pton(&init_net, fo_path, size, sap, salen) == 0) + if (rpc_pton(net, fo_path, size, sap, salen) == 0) return -EINVAL; return nlmsvc_unlock_all_by_ip(sap); @@ -317,6 +336,7 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) int len; struct auth_domain *dom; struct knfsd_fh fh; + struct net *net = file->f_dentry->d_sb->s_fs_info; if (size == 0) return -EINVAL; @@ -352,7 +372,7 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) if (!dom) return -ENOMEM; - len = exp_rootfh(&init_net, dom, path, &fh, maxsize); + len = exp_rootfh(net, dom, path, &fh, maxsize); auth_domain_put(dom); if (len) return len; @@ -396,7 +416,7 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) { char *mesg = buf; int rv; - struct net *net = &init_net; + struct net *net = file->f_dentry->d_sb->s_fs_info; if (size > 0) { int newthreads; @@ -447,7 +467,7 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) int len; int npools; int *nthreads; - struct net *net = &init_net; + struct net *net = file->f_dentry->d_sb->s_fs_info; mutex_lock(&nfsd_mutex); npools = nfsd_nrpools(net); @@ -510,7 +530,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) unsigned minor; ssize_t tlen = 0; char *sep; - struct net *net = &init_net; + struct net *net = file->f_dentry->d_sb->s_fs_info; struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (size>0) { @@ -534,7 +554,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) else num = simple_strtol(vers, &minorp, 0); if (*minorp == '.') { - if (num < 4) + if (num != 4) return -EINVAL; minor = simple_strtoul(minorp+1, NULL, 0); if (minor == 0) @@ -792,7 +812,7 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size, static ssize_t write_ports(struct file *file, char *buf, size_t size) { ssize_t rv; - struct net *net = &init_net; + struct net *net = file->f_dentry->d_sb->s_fs_info; mutex_lock(&nfsd_mutex); rv = __write_ports(file, buf, size, net); @@ -827,7 +847,7 @@ int nfsd_max_blksize; static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) { char *mesg = buf; - struct net *net = &init_net; + struct net *net = file->f_dentry->d_sb->s_fs_info; struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (size > 0) { @@ -923,7 +943,8 @@ static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, */ static ssize_t write_leasetime(struct file *file, char *buf, size_t size) { - struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); + struct net *net = file->f_dentry->d_sb->s_fs_info; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); return nfsd4_write_time(file, buf, size, &nn->nfsd4_lease, nn); } @@ -939,7 +960,8 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size) */ static ssize_t write_gracetime(struct file *file, char *buf, size_t size) { - struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); + struct net *net = file->f_dentry->d_sb->s_fs_info; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); return nfsd4_write_time(file, buf, size, &nn->nfsd4_grace, nn); } @@ -995,7 +1017,8 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size, static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) { ssize_t rv; - struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); + struct net *net = file->f_dentry->d_sb->s_fs_info; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); mutex_lock(&nfsd_mutex); rv = __write_recoverydir(file, buf, size, nn); @@ -1013,7 +1036,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) static int nfsd_fill_super(struct super_block * sb, void * data, int silent) { static struct tree_descr nfsd_files[] = { - [NFSD_List] = {"exports", &exports_operations, S_IRUGO}, + [NFSD_List] = {"exports", &exports_nfsd_operations, S_IRUGO}, [NFSD_Export_features] = {"export_features", &export_features_operations, S_IRUGO}, [NFSD_FO_UnlockIP] = {"unlock_ip", @@ -1037,21 +1060,37 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) #endif /* last one */ {""} }; - return simple_fill_super(sb, 0x6e667364, nfsd_files); + struct net *net = data; + int ret; + + ret = simple_fill_super(sb, 0x6e667364, nfsd_files); + if (ret) + return ret; + sb->s_fs_info = get_net(net); + return 0; } static struct dentry *nfsd_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_single(fs_type, flags, data, nfsd_fill_super); + return mount_ns(fs_type, flags, current->nsproxy->net_ns, nfsd_fill_super); +} + +static void nfsd_umount(struct super_block *sb) +{ + struct net *net = sb->s_fs_info; + + kill_litter_super(sb); + put_net(net); } static struct file_system_type nfsd_fs_type = { .owner = THIS_MODULE, .name = "nfsd", .mount = nfsd_mount, - .kill_sb = kill_litter_super, + .kill_sb = nfsd_umount, }; +MODULE_ALIAS_FS("nfsd"); #ifdef CONFIG_PROC_FS static int create_proc_exports_entry(void) @@ -1061,7 +1100,8 @@ static int create_proc_exports_entry(void) entry = proc_mkdir("fs/nfs", NULL); if (!entry) return -ENOMEM; - entry = proc_create("exports", 0, entry, &exports_operations); + entry = proc_create("exports", 0, entry, + &exports_proc_operations); if (!entry) return -ENOMEM; return 0; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index de23db255c69..07a473fd49bc 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -56,8 +56,8 @@ extern struct svc_version nfsd_version2, nfsd_version3, extern u32 nfsd_supported_minorversion; extern struct mutex nfsd_mutex; extern spinlock_t nfsd_drc_lock; -extern unsigned int nfsd_drc_max_mem; -extern unsigned int nfsd_drc_mem_used; +extern unsigned long nfsd_drc_max_mem; +extern unsigned long nfsd_drc_mem_used; extern const struct seq_operations nfs_exports_op; @@ -106,7 +106,7 @@ static inline int nfsd_v4client(struct svc_rqst *rq) * NFSv4 State */ #ifdef CONFIG_NFSD_V4 -extern unsigned int max_delegations; +extern unsigned long max_delegations; void nfs4_state_init(void); int nfsd4_init_slabs(void); void nfsd4_free_slabs(void); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index aad6d457b9e8..54c6b3d3cc79 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -26,17 +26,13 @@ static __be32 nfsd_return_attrs(__be32 err, struct nfsd_attrstat *resp) { if (err) return err; - return nfserrno(vfs_getattr(resp->fh.fh_export->ex_path.mnt, - resp->fh.fh_dentry, - &resp->stat)); + return fh_getattr(&resp->fh, &resp->stat); } static __be32 nfsd_return_dirop(__be32 err, struct nfsd_diropres *resp) { if (err) return err; - return nfserrno(vfs_getattr(resp->fh.fh_export->ex_path.mnt, - resp->fh.fh_dentry, - &resp->stat)); + return fh_getattr(&resp->fh, &resp->stat); } /* * Get a file's attributes @@ -150,9 +146,7 @@ nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp, &resp->count); if (nfserr) return nfserr; - return nfserrno(vfs_getattr(resp->fh.fh_export->ex_path.mnt, - resp->fh.fh_dentry, - &resp->stat)); + return fh_getattr(&resp->fh, &resp->stat); } /* diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index cee62ab9d4a3..262df5ccbf59 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -59,8 +59,8 @@ DEFINE_MUTEX(nfsd_mutex); * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage. */ spinlock_t nfsd_drc_lock; -unsigned int nfsd_drc_max_mem; -unsigned int nfsd_drc_mem_used; +unsigned long nfsd_drc_max_mem; +unsigned long nfsd_drc_mem_used; #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) static struct svc_stat nfsd_acl_svcstats; @@ -342,7 +342,7 @@ static void set_max_drc(void) >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE; nfsd_drc_mem_used = 0; spin_lock_init(&nfsd_drc_lock); - dprintk("%s nfsd_drc_max_mem %u \n", __func__, nfsd_drc_max_mem); + dprintk("%s nfsd_drc_max_mem %lu \n", __func__, nfsd_drc_max_mem); } static int nfsd_get_default_max_blksize(void) @@ -652,7 +652,6 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) /* Check whether we have this call in the cache. */ switch (nfsd_cache_lookup(rqstp)) { - case RC_INTR: case RC_DROPIT: return 0; case RC_REPLY: @@ -703,8 +702,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) int nfsd_pool_stats_open(struct inode *inode, struct file *file) { int ret; - struct net *net = &init_net; - struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfsd_net *nn = net_generic(inode->i_sb->s_fs_info, nfsd_net_id); mutex_lock(&nfsd_mutex); if (nn->nfsd_serv == NULL) { @@ -721,7 +719,7 @@ int nfsd_pool_stats_open(struct inode *inode, struct file *file) int nfsd_pool_stats_release(struct inode *inode, struct file *file) { int ret = seq_release(inode, file); - struct net *net = &init_net; + struct net *net = inode->i_sb->s_fs_info; mutex_lock(&nfsd_mutex); /* this function really, really should have been called svc_put() */ diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 979b42106979..9c769a47ac5a 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -4,6 +4,7 @@ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ +#include "vfs.h" #include "xdr.h" #include "auth.h" @@ -100,12 +101,14 @@ decode_sattr(__be32 *p, struct iattr *iap) iap->ia_mode = tmp; } if ((tmp = ntohl(*p++)) != (u32)-1) { - iap->ia_valid |= ATTR_UID; - iap->ia_uid = tmp; + iap->ia_uid = make_kuid(&init_user_ns, tmp); + if (uid_valid(iap->ia_uid)) + iap->ia_valid |= ATTR_UID; } if ((tmp = ntohl(*p++)) != (u32)-1) { - iap->ia_valid |= ATTR_GID; - iap->ia_gid = tmp; + iap->ia_gid = make_kgid(&init_user_ns, tmp); + if (gid_valid(iap->ia_gid)) + iap->ia_valid |= ATTR_GID; } if ((tmp = ntohl(*p++)) != (u32)-1) { iap->ia_valid |= ATTR_SIZE; @@ -151,8 +154,8 @@ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, *p++ = htonl(nfs_ftypes[type >> 12]); *p++ = htonl((u32) stat->mode); *p++ = htonl((u32) stat->nlink); - *p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid)); - *p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid)); + *p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid)); + *p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid)); if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) { *p++ = htonl(NFS_MAXPATHLEN); @@ -194,11 +197,9 @@ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, } /* Helper function for NFSv2 ACL code */ -__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat) { - struct kstat stat; - vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry, &stat); - return encode_fattr(rqstp, p, fhp, &stat); + return encode_fattr(rqstp, p, fhp, stat); } /* diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index d1c229feed52..1a8c7391f7ae 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -152,8 +152,8 @@ struct nfsd4_channel_attrs { struct nfsd4_cb_sec { u32 flavor; /* (u32)(-1) used to mean "no valid flavor" */ - u32 uid; - u32 gid; + kuid_t uid; + kgid_t gid; }; struct nfsd4_create_session { diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index d586117fa94a..2b2e2396a869 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -401,8 +401,8 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, /* Revoke setuid/setgid on chown */ if (!S_ISDIR(inode->i_mode) && - (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) || - ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid))) { + (((iap->ia_valid & ATTR_UID) && !uid_eq(iap->ia_uid, inode->i_uid)) || + ((iap->ia_valid & ATTR_GID) && !gid_eq(iap->ia_gid, inode->i_gid)))) { iap->ia_valid |= ATTR_KILL_PRIV; if (iap->ia_valid & ATTR_MODE) { /* we're setting mode too, just clear the s*id bits */ @@ -979,7 +979,7 @@ static void kill_suid(struct dentry *dentry) */ static int wait_for_concurrent_writes(struct file *file) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); static ino_t last_ino; static dev_t last_dev; int err = 0; @@ -1013,6 +1013,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, int host_err; int stable = *stablep; int use_wgather; + loff_t pos = offset; dentry = file->f_path.dentry; inode = dentry->d_inode; @@ -1025,7 +1026,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, /* Write the data. */ oldfs = get_fs(); set_fs(KERNEL_DS); - host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); + host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos); set_fs(oldfs); if (host_err < 0) goto out_nfserr; @@ -1070,7 +1071,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, if (err) return err; - inode = file->f_path.dentry->d_inode; + inode = file_inode(file); /* Get readahead parameters */ ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino); @@ -1205,7 +1206,7 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp, * send along the gid on create when it tries to implement * setgid directories via NFS: */ - if (current_fsuid() != 0) + if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID)) iap->ia_valid &= ~(ATTR_UID|ATTR_GID); if (iap->ia_valid) return nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); @@ -1957,7 +1958,7 @@ static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func, offset = *offsetp; while (1) { - struct inode *dir_inode = file->f_path.dentry->d_inode; + struct inode *dir_inode = file_inode(file); unsigned int reclen; cdp->err = nfserr_eof; /* will be cleared on successful read */ @@ -2150,7 +2151,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, * with NFSv3. */ if ((acc & NFSD_MAY_OWNER_OVERRIDE) && - inode->i_uid == current_fsuid()) + uid_eq(inode->i_uid, current_fsuid())) return 0; /* This assumes NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */ diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 359594c393d2..5b5894159f22 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -6,6 +6,7 @@ #define LINUX_NFSD_VFS_H #include "nfsfh.h" +#include "nfsd.h" /* * Flags for nfsd_permission @@ -125,4 +126,11 @@ static inline void fh_drop_write(struct svc_fh *fh) } } +static inline __be32 fh_getattr(struct svc_fh *fh, struct kstat *stat) +{ + struct path p = {.mnt = fh->fh_export->ex_path.mnt, + .dentry = fh->fh_dentry}; + return nfserrno(vfs_getattr(&p, stat)); +} + #endif /* LINUX_NFSD_VFS_H */ diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h index 53b1863dd8f6..4f0481d63804 100644 --- a/fs/nfsd/xdr.h +++ b/fs/nfsd/xdr.h @@ -167,7 +167,7 @@ int nfssvc_encode_entry(void *, const char *name, int nfssvc_release_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *); /* Helper functions for NFSv2 ACL code */ -__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp); +__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat); __be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp); #endif /* LINUX_NFSD_H */ diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 7df980eb0562..b6d5542a4ac8 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -136,6 +136,7 @@ struct nfsd3_accessres { __be32 status; struct svc_fh fh; __u32 access; + struct kstat stat; }; struct nfsd3_readlinkres { @@ -225,6 +226,7 @@ struct nfsd3_getaclres { int mask; struct posix_acl *acl_access; struct posix_acl *acl_default; + struct kstat stat; }; /* dummy type for release */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 0889bfb43dc9..546f8983ecf1 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -563,7 +563,7 @@ __be32 nfsd4_check_resp_size(struct nfsd4_compoundres *, u32); void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *); void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op); __be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, - struct dentry *dentry, __be32 *buffer, int *countp, + struct dentry *dentry, __be32 **buffer, int countp, u32 *bmval, struct svc_rqst *, int ignore_crossmnt); extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *, diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig index 251da07b2a1d..80da8eb27393 100644 --- a/fs/nilfs2/Kconfig +++ b/fs/nilfs2/Kconfig @@ -1,6 +1,5 @@ config NILFS2_FS - tristate "NILFS2 file system support (EXPERIMENTAL)" - depends on EXPERIMENTAL + tristate "NILFS2 file system support" select CRC32 help NILFS2 is a log-structured file system (LFS) supporting continuous diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index df1a7fb238d1..f30b017740a7 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -259,7 +259,7 @@ static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode) static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir) { loff_t pos = filp->f_pos; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; unsigned int offset = pos & ~PAGE_CACHE_MASK; unsigned long n = pos >> PAGE_CACHE_SHIFT; diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 61946883025c..08fdb77852ac 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -67,7 +67,7 @@ int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct inode *inode = file_inode(vma->vm_file); struct nilfs_transaction_info ti; int ret = 0; @@ -126,7 +126,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) nilfs_transaction_commit(inode->i_sb); mapped: - wait_on_page_writeback(page); + wait_for_stable_page(page); out: sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(ret); diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index fdb180769485..b44bdb291b84 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -664,8 +664,11 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, if (ret < 0) printk(KERN_ERR "NILFS: GC failed during preparation: " "cannot read source blocks: err=%d\n", ret); - else + else { + if (nilfs_sb_need_update(nilfs)) + set_nilfs_discontinued(nilfs); ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); + } nilfs_remove_all_gcinodes(nilfs); clear_nilfs_gc_running(nilfs); @@ -793,7 +796,7 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp, long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = file_inode(filp); void __user *argp = (void __user *)arg; switch (cmd) { diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 1d0c0b84c5a3..9de78f08989e 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -517,11 +517,11 @@ static int nilfs_encode_fh(struct inode *inode, __u32 *fh, int *lenp, if (parent && *lenp < NILFS_FID_SIZE_CONNECTABLE) { *lenp = NILFS_FID_SIZE_CONNECTABLE; - return 255; + return FILEID_INVALID; } if (*lenp < NILFS_FID_SIZE_NON_CONNECTABLE) { *lenp = NILFS_FID_SIZE_NON_CONNECTABLE; - return 255; + return FILEID_INVALID; } fid->cno = root->cno; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 3c991dc84f2f..c7d1f9f18b09 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1361,6 +1361,7 @@ struct file_system_type nilfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("nilfs2"); static void nilfs_inode_init_once(void *obj) { diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 08b886f119ce..2bfe6dc413a0 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -174,7 +174,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id) struct dnotify_struct **prev; struct inode *inode; - inode = filp->f_path.dentry->d_inode; + inode = file_inode(filp); if (!S_ISDIR(inode->i_mode)) return; @@ -296,7 +296,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) } /* dnotify only works on directories */ - inode = filp->f_path.dentry->d_inode; + inode = file_inode(filp); if (!S_ISDIR(inode->i_mode)) { error = -ENOTDIR; goto out_err; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 9ff4a5ee6e20..5d8444268a16 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -466,7 +466,7 @@ static int fanotify_find_path(int dfd, const char __user *filename, ret = -ENOTDIR; if ((flags & FAN_MARK_ONLYDIR) && - !(S_ISDIR(f.file->f_path.dentry->d_inode->i_mode))) { + !(S_ISDIR(file_inode(f.file)->i_mode))) { fdput(f); goto out; } diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 6baadb5a8430..4bb21d67d9b1 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -52,7 +52,6 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt) void __fsnotify_update_child_dentry_flags(struct inode *inode) { struct dentry *alias; - struct hlist_node *p; int watched; if (!S_ISDIR(inode->i_mode)) @@ -64,7 +63,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode) spin_lock(&inode->i_lock); /* run all of the dentries associated with this inode. Since this is a * directory, there damn well better only be one item on this list */ - hlist_for_each_entry(alias, p, &inode->i_dentry, d_alias) { + hlist_for_each_entry(alias, &inode->i_dentry, d_alias) { struct dentry *child; /* run all of the children of the original inode and fix their diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index f31e90fc050d..74825be65b7b 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -36,12 +36,11 @@ static void fsnotify_recalc_inode_mask_locked(struct inode *inode) { struct fsnotify_mark *mark; - struct hlist_node *pos; __u32 new_mask = 0; assert_spin_locked(&inode->i_lock); - hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list) + hlist_for_each_entry(mark, &inode->i_fsnotify_marks, i.i_list) new_mask |= mark->mask; inode->i_fsnotify_mask = new_mask; } @@ -87,11 +86,11 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) void fsnotify_clear_marks_by_inode(struct inode *inode) { struct fsnotify_mark *mark, *lmark; - struct hlist_node *pos, *n; + struct hlist_node *n; LIST_HEAD(free_list); spin_lock(&inode->i_lock); - hlist_for_each_entry_safe(mark, pos, n, &inode->i_fsnotify_marks, i.i_list) { + hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, i.i_list) { list_add(&mark->i.free_i_list, &free_list); hlist_del_init_rcu(&mark->i.i_list); fsnotify_get_mark(mark); @@ -129,11 +128,10 @@ static struct fsnotify_mark *fsnotify_find_inode_mark_locked( struct inode *inode) { struct fsnotify_mark *mark; - struct hlist_node *pos; assert_spin_locked(&inode->i_lock); - hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list) { + hlist_for_each_entry(mark, &inode->i_fsnotify_marks, i.i_list) { if (mark->group == group) { fsnotify_get_mark(mark); return mark; @@ -194,8 +192,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, struct inode *inode, int allow_dups) { - struct fsnotify_mark *lmark; - struct hlist_node *node, *last = NULL; + struct fsnotify_mark *lmark, *last = NULL; int ret = 0; mark->flags |= FSNOTIFY_MARK_FLAG_INODE; @@ -214,8 +211,8 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, } /* should mark be in the middle of the current list? */ - hlist_for_each_entry(lmark, node, &inode->i_fsnotify_marks, i.i_list) { - last = node; + hlist_for_each_entry(lmark, &inode->i_fsnotify_marks, i.i_list) { + last = lmark; if ((lmark->group == group) && !allow_dups) { ret = -EEXIST; @@ -235,7 +232,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, BUG_ON(last == NULL); /* mark should be the last entry. last is the current last entry */ - hlist_add_after_rcu(last, &mark->i.i_list); + hlist_add_after_rcu(&last->i.i_list, &mark->i.i_list); out: fsnotify_recalc_inode_mask_locked(inode); spin_unlock(&inode->i_lock); diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 871569c7d609..4216308b81b4 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -197,7 +197,6 @@ static void inotify_free_group_priv(struct fsnotify_group *group) { /* ideally the idr is empty and we won't hit the BUG in the callback */ idr_for_each(&group->inotify_data.idr, idr_callback, group); - idr_remove_all(&group->inotify_data.idr); idr_destroy(&group->inotify_data.idr); atomic_dec(&group->inotify_data.user->inotify_devs); free_uid(group->inotify_data.user); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 228a2c2ad8d7..e0f7c1241a6a 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -364,22 +364,20 @@ static int inotify_add_to_idr(struct idr *idr, spinlock_t *idr_lock, { int ret; - do { - if (unlikely(!idr_pre_get(idr, GFP_KERNEL))) - return -ENOMEM; + idr_preload(GFP_KERNEL); + spin_lock(idr_lock); - spin_lock(idr_lock); - ret = idr_get_new_above(idr, i_mark, *last_wd + 1, - &i_mark->wd); + ret = idr_alloc(idr, i_mark, *last_wd + 1, 0, GFP_NOWAIT); + if (ret >= 0) { /* we added the mark to the idr, take a reference */ - if (!ret) { - *last_wd = i_mark->wd; - fsnotify_get_mark(&i_mark->fsn_mark); - } - spin_unlock(idr_lock); - } while (ret == -EAGAIN); + i_mark->wd = ret; + *last_wd = i_mark->wd; + fsnotify_get_mark(&i_mark->fsn_mark); + } - return ret; + spin_unlock(idr_lock); + idr_preload_end(); + return ret < 0 ? ret : 0; } static struct inotify_inode_mark *inotify_idr_find_locked(struct fsnotify_group *group, @@ -576,8 +574,6 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, /* don't allow invalid bits: we don't want flags set */ mask = inotify_arg_to_mask(arg); - if (unlikely(!(mask & IN_ALL_EVENTS))) - return -EINVAL; fsn_mark = fsnotify_find_inode_mark(group, inode); if (!fsn_mark) @@ -629,8 +625,6 @@ static int inotify_new_watch(struct fsnotify_group *group, /* don't allow invalid bits: we don't want flags set */ mask = inotify_arg_to_mask(arg); - if (unlikely(!(mask & IN_ALL_EVENTS))) - return -EINVAL; tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); if (unlikely(!tmp_i_mark)) diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index 4df58b8ea64a..68ca5a8704b5 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -33,12 +33,12 @@ void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) { struct fsnotify_mark *mark, *lmark; - struct hlist_node *pos, *n; + struct hlist_node *n; struct mount *m = real_mount(mnt); LIST_HEAD(free_list); spin_lock(&mnt->mnt_root->d_lock); - hlist_for_each_entry_safe(mark, pos, n, &m->mnt_fsnotify_marks, m.m_list) { + hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, m.m_list) { list_add(&mark->m.free_m_list, &free_list); hlist_del_init_rcu(&mark->m.m_list); fsnotify_get_mark(mark); @@ -71,12 +71,11 @@ static void fsnotify_recalc_vfsmount_mask_locked(struct vfsmount *mnt) { struct mount *m = real_mount(mnt); struct fsnotify_mark *mark; - struct hlist_node *pos; __u32 new_mask = 0; assert_spin_locked(&mnt->mnt_root->d_lock); - hlist_for_each_entry(mark, pos, &m->mnt_fsnotify_marks, m.m_list) + hlist_for_each_entry(mark, &m->mnt_fsnotify_marks, m.m_list) new_mask |= mark->mask; m->mnt_fsnotify_mask = new_mask; } @@ -114,11 +113,10 @@ static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_ { struct mount *m = real_mount(mnt); struct fsnotify_mark *mark; - struct hlist_node *pos; assert_spin_locked(&mnt->mnt_root->d_lock); - hlist_for_each_entry(mark, pos, &m->mnt_fsnotify_marks, m.m_list) { + hlist_for_each_entry(mark, &m->mnt_fsnotify_marks, m.m_list) { if (mark->group == group) { fsnotify_get_mark(mark); return mark; @@ -153,8 +151,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, int allow_dups) { struct mount *m = real_mount(mnt); - struct fsnotify_mark *lmark; - struct hlist_node *node, *last = NULL; + struct fsnotify_mark *lmark, *last = NULL; int ret = 0; mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT; @@ -173,8 +170,8 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, } /* should mark be in the middle of the current list? */ - hlist_for_each_entry(lmark, node, &m->mnt_fsnotify_marks, m.m_list) { - last = node; + hlist_for_each_entry(lmark, &m->mnt_fsnotify_marks, m.m_list) { + last = lmark; if ((lmark->group == group) && !allow_dups) { ret = -EEXIST; @@ -194,7 +191,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, BUG_ON(last == NULL); /* mark should be the last entry. last is the current last entry */ - hlist_add_after_rcu(last, &mark->m.m_list); + hlist_add_after_rcu(&last->m.m_list, &mark->m.m_list); out: fsnotify_recalc_vfsmount_mask_locked(mnt); spin_unlock(&mnt->mnt_root->d_lock); diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 99e36107ff60..aa411c3f20e9 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1101,7 +1101,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) { s64 ia_pos, ia_start, prev_ia_pos, bmp_pos; loff_t fpos, i_size; - struct inode *bmp_vi, *vdir = filp->f_path.dentry->d_inode; + struct inode *bmp_vi, *vdir = file_inode(filp); struct super_block *sb = vdir->i_sb; ntfs_inode *ndir = NTFS_I(vdir); ntfs_volume *vol = NTFS_SB(sb); diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 4a8289f8b16c..82650d52d916 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -3079,6 +3079,7 @@ static struct file_system_type ntfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("ntfs"); /* Stable names for the slab caches. */ static const char ntfs_index_ctx_cache_name[] = "ntfs_index_ctx_cache"; diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 260b16281fc3..8a404576fb26 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -65,7 +65,20 @@ static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size) acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); - acl->a_entries[n].e_id = le32_to_cpu(entry->e_id); + switch(acl->a_entries[n].e_tag) { + case ACL_USER: + acl->a_entries[n].e_uid = + make_kuid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; + case ACL_GROUP: + acl->a_entries[n].e_gid = + make_kgid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; + default: + break; + } value += sizeof(struct posix_acl_entry); } @@ -91,7 +104,21 @@ static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size) for (n = 0; n < acl->a_count; n++, entry++) { entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); - entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); + switch(acl->a_entries[n].e_tag) { + case ACL_USER: + entry->e_id = cpu_to_le32( + from_kuid(&init_user_ns, + acl->a_entries[n].e_uid)); + break; + case ACL_GROUP: + entry->e_id = cpu_to_le32( + from_kgid(&init_user_ns, + acl->a_entries[n].e_gid)); + break; + default: + entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); + break; + } } return ocfs2_acl; } diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 31b9463fba1f..b8a9d87231b1 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6751,8 +6751,7 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, mlog_errno(ret); out: - if (pages) - kfree(pages); + kfree(pages); return ret; } diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 657743254eb9..20dfec72e903 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -569,7 +569,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, int ret, bool is_async) { - struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(iocb->ki_filp); int level; wait_queue_head_t *wq = ocfs2_ioend_wq(inode); @@ -593,9 +593,9 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, level = ocfs2_iocb_rw_locked_level(iocb); ocfs2_rw_unlock(inode, level); + inode_dio_done(inode); if (is_async) aio_complete(iocb, ret, 0); - inode_dio_done(inode); } /* @@ -626,7 +626,7 @@ static ssize_t ocfs2_direct_IO(int rw, unsigned long nr_segs) { struct file *file = iocb->ki_filp; - struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; + struct inode *inode = file_inode(file)->i_mapping->host; /* * Fallback to buffered I/O if we see an inode without @@ -1194,6 +1194,7 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, goto out; } } + wait_for_stable_page(wc->w_pages[i]); if (index == target_index) wc->w_target_page = wc->w_pages[i]; diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f7c648d7d6bf..42252bf64b51 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1471,8 +1471,7 @@ static void o2hb_region_release(struct config_item *item) mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name); - if (reg->hr_tmp_block) - kfree(reg->hr_tmp_block); + kfree(reg->hr_tmp_block); if (reg->hr_slot_data) { for (i = 0; i < reg->hr_num_pages; i++) { @@ -1486,8 +1485,7 @@ static void o2hb_region_release(struct config_item *item) if (reg->hr_bdev) blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE); - if (reg->hr_slots) - kfree(reg->hr_slots); + kfree(reg->hr_slots); kfree(reg->hr_db_regnum); kfree(reg->hr_db_livenodes); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 1bfe8802cc1e..aa88bd8bcedc 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -304,28 +304,22 @@ static u8 o2net_num_from_nn(struct o2net_node *nn) static int o2net_prep_nsw(struct o2net_node *nn, struct o2net_status_wait *nsw) { - int ret = 0; - - do { - if (!idr_pre_get(&nn->nn_status_idr, GFP_ATOMIC)) { - ret = -EAGAIN; - break; - } - spin_lock(&nn->nn_lock); - ret = idr_get_new(&nn->nn_status_idr, nsw, &nsw->ns_id); - if (ret == 0) - list_add_tail(&nsw->ns_node_item, - &nn->nn_status_list); - spin_unlock(&nn->nn_lock); - } while (ret == -EAGAIN); + int ret; - if (ret == 0) { - init_waitqueue_head(&nsw->ns_wq); - nsw->ns_sys_status = O2NET_ERR_NONE; - nsw->ns_status = 0; + spin_lock(&nn->nn_lock); + ret = idr_alloc(&nn->nn_status_idr, nsw, 0, 0, GFP_ATOMIC); + if (ret >= 0) { + nsw->ns_id = ret; + list_add_tail(&nsw->ns_node_item, &nn->nn_status_list); } + spin_unlock(&nn->nn_lock); + if (ret < 0) + return ret; - return ret; + init_waitqueue_head(&nsw->ns_wq); + nsw->ns_sys_status = O2NET_ERR_NONE; + nsw->ns_status = 0; + return 0; } static void o2net_complete_nsw_locked(struct o2net_node *nn, @@ -870,7 +864,7 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, /* we've had some trouble with handlers seemingly vanishing. */ mlog_bug_on_msg(o2net_handler_tree_lookup(msg_type, key, &p, &parent) == NULL, - "couldn't find handler we *just* registerd " + "couldn't find handler we *just* registered " "for type %u key %08x\n", msg_type, key); } write_unlock(&o2net_handler_lock); @@ -1165,10 +1159,8 @@ out: o2net_debug_del_nst(&nst); /* must be before dropping sc and node */ if (sc) sc_put(sc); - if (vec) - kfree(vec); - if (msg) - kfree(msg); + kfree(vec); + kfree(msg); o2net_complete_nsw(nn, &nsw, 0, 0, 0); return ret; } diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 8db4b58b2e4b..ef999729e274 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -169,11 +169,10 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, int skip_unhashed) { - struct hlist_node *p; struct dentry *dentry; spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) { + hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { spin_lock(&dentry->d_lock); if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) { trace_ocfs2_find_local_alias(dentry->d_name.len, diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 8fe4e2892ab9..f1e1aed8f638 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -67,7 +67,6 @@ #define NAMEI_RA_CHUNKS 2 #define NAMEI_RA_BLOCKS 4 #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) -#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) static unsigned char ocfs2_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK @@ -2015,12 +2014,12 @@ int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv, int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) { int error = 0; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); int lock_level = 0; trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno); - error = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level); + error = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level); if (lock_level && error >= 0) { /* We release EX lock which used to update atime * and get PR lock again to reduce contention diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 9e89d70df337..dbb17c07656a 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -319,9 +319,7 @@ static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) if (dlm->master_hash) dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES); - if (dlm->name) - kfree(dlm->name); - + kfree(dlm->name); kfree(dlm); } diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 005261c333b0..33ecbe0e6734 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2020,7 +2020,7 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, int ignore_higher, u8 request_from, u32 flags) { struct dlm_work_item *item; - item = kzalloc(sizeof(*item), GFP_NOFS); + item = kzalloc(sizeof(*item), GFP_ATOMIC); if (!item) return -ENOMEM; diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 01ebfd0bdad7..eeac97bb3bfa 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2083,7 +2083,6 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, u8 dead_node, u8 new_master) { int i; - struct hlist_node *hash_iter; struct hlist_head *bucket; struct dlm_lock_resource *res, *next; @@ -2114,7 +2113,7 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, * if necessary */ for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_lockres_hash(dlm, i); - hlist_for_each_entry(res, hash_iter, bucket, hash_node) { + hlist_for_each_entry(res, bucket, hash_node) { if (!(res->state & DLM_LOCK_RES_RECOVERING)) continue; @@ -2273,7 +2272,6 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) { - struct hlist_node *iter; struct dlm_lock_resource *res; int i; struct hlist_head *bucket; @@ -2299,7 +2297,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) */ for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_lockres_hash(dlm, i); - hlist_for_each_entry(res, iter, bucket, hash_node) { + hlist_for_each_entry(res, bucket, hash_node) { /* always prune any $RECOVERY entries for dead nodes, * otherwise hangs can occur during later recovery */ if (dlm_is_recovery_lock(res->lockname.name, diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 16b712d260d4..12bafb7265ce 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -224,7 +224,7 @@ static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr) static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait) { int event = 0; - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct dlmfs_inode_private *ip = DLMFS_I(inode); poll_wait(file, &ip->ip_lockres.l_event, wait); @@ -245,7 +245,7 @@ static ssize_t dlmfs_file_read(struct file *filp, int bytes_left; ssize_t readlen, got; char *lvb_buf; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", inode->i_ino, count, *ppos); @@ -293,7 +293,7 @@ static ssize_t dlmfs_file_write(struct file *filp, int bytes_left; ssize_t writelen; char *lvb_buf; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", inode->i_ino, count, *ppos); @@ -640,6 +640,7 @@ static struct file_system_type dlmfs_fs_type = { .mount = dlmfs_mount, .kill_sb = kill_litter_super, }; +MODULE_ALIAS_FS("ocfs2_dlmfs"); static int __init init_dlmfs_fs(void) { diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 4f7795fb5fc0..12ae194ac943 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2045,8 +2045,8 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) lvb->lvb_version = OCFS2_LVB_VERSION; lvb->lvb_isize = cpu_to_be64(i_size_read(inode)); lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters); - lvb->lvb_iuid = cpu_to_be32(inode->i_uid); - lvb->lvb_igid = cpu_to_be32(inode->i_gid); + lvb->lvb_iuid = cpu_to_be32(i_uid_read(inode)); + lvb->lvb_igid = cpu_to_be32(i_gid_read(inode)); lvb->lvb_imode = cpu_to_be16(inode->i_mode); lvb->lvb_inlink = cpu_to_be16(inode->i_nlink); lvb->lvb_iatime_packed = @@ -2095,8 +2095,8 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) else inode->i_blocks = ocfs2_inode_sector_count(inode); - inode->i_uid = be32_to_cpu(lvb->lvb_iuid); - inode->i_gid = be32_to_cpu(lvb->lvb_igid); + i_uid_write(inode, be32_to_cpu(lvb->lvb_iuid)); + i_gid_write(inode, be32_to_cpu(lvb->lvb_igid)); inode->i_mode = be16_to_cpu(lvb->lvb_imode); set_nlink(inode, be16_to_cpu(lvb->lvb_inlink)); ocfs2_unpack_timespec(&inode->i_atime, @@ -2545,6 +2545,7 @@ int ocfs2_super_lock(struct ocfs2_super *osb, * everything is up to the caller :) */ status = ocfs2_should_refresh_lock_res(lockres); if (status < 0) { + ocfs2_cluster_unlock(osb, lockres, level); mlog_errno(status); goto bail; } @@ -2553,8 +2554,10 @@ int ocfs2_super_lock(struct ocfs2_super *osb, ocfs2_complete_lock_res_refresh(lockres, status); - if (status < 0) + if (status < 0) { + ocfs2_cluster_unlock(osb, lockres, level); mlog_errno(status); + } ocfs2_track_lock_refresh(lockres); } bail: diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 322216a5f0dd..29651167190d 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -195,11 +195,11 @@ static int ocfs2_encode_fh(struct inode *inode, u32 *fh_in, int *max_len, if (parent && (len < 6)) { *max_len = 6; - type = 255; + type = FILEID_INVALID; goto bail; } else if (len < 3) { *max_len = 3; - type = 255; + type = FILEID_INVALID; goto bail; } diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index f487aa343442..1c39efb71bab 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -282,8 +282,7 @@ search: spin_unlock(&oi->ip_lock); out: - if (new_emi) - kfree(new_emi); + kfree(new_emi); } static int ocfs2_last_eb_is_empty(struct inode *inode, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 37d313ede159..6474cb44004d 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1116,7 +1116,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) (unsigned long long)OCFS2_I(inode)->ip_blkno, dentry->d_name.len, dentry->d_name.name, attr->ia_valid, attr->ia_mode, - attr->ia_uid, attr->ia_gid); + from_kuid(&init_user_ns, attr->ia_uid), + from_kgid(&init_user_ns, attr->ia_gid)); /* ensuring we don't even attempt to truncate a symlink */ if (S_ISLNK(inode->i_mode)) @@ -1174,14 +1175,14 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } } - if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || - (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + if ((attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || + (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { /* * Gather pointers to quota structures so that allocation / * freeing of quota structures happens here and not inside * dquot_transfer() where we have problems with lock ordering */ - if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid + if (attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid) && OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid)); @@ -1190,7 +1191,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) goto bail_unlock; } } - if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid + if (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid) && OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid)); @@ -1949,7 +1950,7 @@ out: int ocfs2_change_file_space(struct file *file, unsigned int cmd, struct ocfs2_space_resv *sr) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); int ret; @@ -1977,7 +1978,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd, static long ocfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_space_resv sr; int change_size = 1; @@ -2232,7 +2233,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, loff_t old_size, *ppos = &iocb->ki_pos; u32 old_clusters; struct file *file = iocb->ki_filp; - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); int full_coherency = !(osb->s_mount_opt & OCFS2_MOUNT_COHERENCY_BUFFERED); @@ -2516,7 +2517,7 @@ static ssize_t ocfs2_file_splice_read(struct file *in, unsigned int flags) { int ret = 0, lock_level = 0; - struct inode *inode = in->f_path.dentry->d_inode; + struct inode *inode = file_inode(in); trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -2526,7 +2527,7 @@ static ssize_t ocfs2_file_splice_read(struct file *in, /* * See the comment in ocfs2_file_aio_read() */ - ret = ocfs2_inode_lock_atime(inode, in->f_vfsmnt, &lock_level); + ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level); if (ret < 0) { mlog_errno(ret); goto bail; @@ -2546,7 +2547,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, { int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; struct file *filp = iocb->ki_filp; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -2589,7 +2590,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, * like i_size. This allows the checks down below * generic_file_aio_read() a chance of actually working. */ - ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level); + ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level); if (ret < 0) { mlog_errno(ret); goto bail; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index d89e08a81eda..f87f9bd1edff 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -269,8 +269,8 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, inode->i_generation = le32_to_cpu(fe->i_generation); inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); inode->i_mode = le16_to_cpu(fe->i_mode); - inode->i_uid = le32_to_cpu(fe->i_uid); - inode->i_gid = le32_to_cpu(fe->i_gid); + i_uid_write(inode, le32_to_cpu(fe->i_uid)); + i_gid_write(inode, le32_to_cpu(fe->i_gid)); /* Fast symlinks will have i_size but no allocated clusters. */ if (S_ISLNK(inode->i_mode) && !fe->i_clusters) { @@ -1259,8 +1259,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle, fe->i_size = cpu_to_le64(i_size_read(inode)); ocfs2_set_links_count(fe, inode->i_nlink); - fe->i_uid = cpu_to_le32(inode->i_uid); - fe->i_gid = cpu_to_le32(inode->i_gid); + fe->i_uid = cpu_to_le32(i_uid_read(inode)); + fe->i_gid = cpu_to_le32(i_gid_read(inode)); fe->i_mode = cpu_to_le16(inode->i_mode); fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec); fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); @@ -1290,8 +1290,8 @@ void ocfs2_refresh_inode(struct inode *inode, ocfs2_set_inode_flags(inode); i_size_write(inode, le64_to_cpu(fe->i_size)); set_nlink(inode, ocfs2_read_links_count(fe)); - inode->i_uid = le32_to_cpu(fe->i_uid); - inode->i_gid = le32_to_cpu(fe->i_gid); + i_uid_write(inode, le32_to_cpu(fe->i_uid)); + i_gid_write(inode, le32_to_cpu(fe->i_gid)); inode->i_mode = le16_to_cpu(fe->i_mode); if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0) inode->i_blocks = 0; diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index f20edcbfe700..752f0b26221d 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -881,7 +881,7 @@ bail: long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); unsigned int flags; int new_clusters; int status; @@ -994,7 +994,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) { bool preserve; struct reflink_arguments args; - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct ocfs2_info info; void __user *argp = (void __user *)arg; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 2dd36af79e26..8eccfabcd12e 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1234,11 +1234,8 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, /* Though we wish to avoid it, we are in fact safe in * skipping local alloc cleanup as fsck.ocfs2 is more * than capable of reclaiming unused space. */ - if (la_dinode) - kfree(la_dinode); - - if (tl_dinode) - kfree(tl_dinode); + kfree(la_dinode); + kfree(tl_dinode); if (qrec) ocfs2_free_quota_recovery(qrec); @@ -1408,8 +1405,7 @@ bail: mutex_unlock(&osb->recovery_lock); - if (rm_quota) - kfree(rm_quota); + kfree(rm_quota); /* no one is callint kthread_stop() for us so the kthread() api * requires that we call do_exit(). And it isn't exported, but diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index a9f78c74d687..aebeacd807c3 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -476,8 +476,7 @@ out: if (local_alloc_inode) iput(local_alloc_inode); - if (alloc_copy) - kfree(alloc_copy); + kfree(alloc_copy); } /* @@ -534,7 +533,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, mlog_errno(status); bail: - if ((status < 0) && (*alloc_copy)) { + if (status < 0) { kfree(*alloc_copy); *alloc_copy = NULL; } @@ -1290,8 +1289,7 @@ bail: if (main_bm_inode) iput(main_bm_inode); - if (alloc_copy) - kfree(alloc_copy); + kfree(alloc_copy); if (ac) ocfs2_free_alloc_context(ac); diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 47a87dda54ce..10d66c75cecb 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -62,7 +62,7 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, struct page *page) { int ret = VM_FAULT_NOPAGE; - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; loff_t pos = page_offset(page); unsigned int len = PAGE_CACHE_SIZE; @@ -131,7 +131,7 @@ out: static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct inode *inode = file_inode(vma->vm_file); struct buffer_head *di_bh = NULL; sigset_t oldset; int ret; @@ -180,13 +180,13 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) { int ret = 0, lock_level = 0; - ret = ocfs2_inode_lock_atime(file->f_dentry->d_inode, - file->f_vfsmnt, &lock_level); + ret = ocfs2_inode_lock_atime(file_inode(file), + file->f_path.mnt, &lock_level); if (ret < 0) { mlog_errno(ret); goto out; } - ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level); + ocfs2_inode_unlock(file_inode(file), lock_level); out: vma->vm_ops = &ocfs2_file_vm_ops; return 0; diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 6083432f667e..9f8dcadd9a50 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -1055,7 +1055,7 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) { int status; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct ocfs2_move_extents range; struct ocfs2_move_extents_context *context = NULL; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index f1fd0741162b..04ee1b57c243 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -512,8 +512,8 @@ static int __ocfs2_mknod_locked(struct inode *dir, fe->i_suballoc_loc = cpu_to_le64(suballoc_loc); fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); - fe->i_uid = cpu_to_le32(inode->i_uid); - fe->i_gid = cpu_to_le32(inode->i_gid); + fe->i_uid = cpu_to_le32(i_uid_read(inode)); + fe->i_gid = cpu_to_le32(i_gid_read(inode)); fe->i_mode = cpu_to_le16(inode->i_mode); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 30a055049e16..998b17eda09d 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2927,7 +2927,7 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, u32 new_cluster, u32 new_len) { int ret = 0, partial; - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct ocfs2_caching_info *ci = INODE_CACHE(inode); struct super_block *sb = ocfs2_metadata_cache_get_super(ci); u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); @@ -3020,7 +3020,7 @@ int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, u32 new_cluster, u32 new_len) { int ret = 0; - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct ocfs2_caching_info *ci = INODE_CACHE(inode); int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); @@ -4407,7 +4407,7 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, * rights to do so. */ if (preserve) { - if ((current_fsuid() != inode->i_uid) && !capable(CAP_CHOWN)) + if (!uid_eq(current_fsuid(), inode->i_uid) && !capable(CAP_CHOWN)) return -EPERM; if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN)) return -EPERM; diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 94368017edb3..bf1f8930456f 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -376,7 +376,7 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn) dlm_register_eviction_cb(dlm, &priv->op_eviction_cb); out_free: - if (rc && conn->cc_private) + if (rc) kfree(conn->cc_private); out: diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index f169da4624fd..b7e74b580c0f 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -642,7 +642,7 @@ ocfs2_block_group_alloc_discontig(handle_t *handle, * cluster groups will be staying in cache for the duration of * this operation. */ - ac->ac_allow_chain_relink = 0; + ac->ac_disable_chain_relink = 1; /* Claim the first region */ status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits, @@ -1823,7 +1823,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, * Do this *after* figuring out how many bits we're taking out * of our target group. */ - if (ac->ac_allow_chain_relink && + if (!ac->ac_disable_chain_relink && (prev_group_bh) && (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) { status = ocfs2_relink_block_group(handle, alloc_inode, @@ -1928,7 +1928,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, victim = ocfs2_find_victim_chain(cl); ac->ac_chain = victim; - ac->ac_allow_chain_relink = 1; status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, res, &bits_left); @@ -1947,7 +1946,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, * searching each chain in order. Don't allow chain relinking * because we only calculate enough journal credits for one * relink per alloc. */ - ac->ac_allow_chain_relink = 0; + ac->ac_disable_chain_relink = 1; for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) { if (i == victim) continue; diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index b8afabfeede4..a36d0aa50911 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -49,7 +49,7 @@ struct ocfs2_alloc_context { /* these are used by the chain search */ u16 ac_chain; - int ac_allow_chain_relink; + int ac_disable_chain_relink; group_search_t *ac_group_search; u64 ac_last_group; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 0e91ec22a940..01b85165552b 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1266,6 +1266,7 @@ static struct file_system_type ocfs2_fs_type = { .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, .next = NULL }; +MODULE_ALIAS_FS("ocfs2"); static int ocfs2_check_set_options(struct super_block *sb, struct mount_options *options) @@ -2525,8 +2526,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) mlog_errno(status); finally: - if (local_alloc) - kfree(local_alloc); + kfree(local_alloc); if (status) mlog_errno(status); @@ -2553,8 +2553,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) * we free it here. */ kfree(osb->journal); - if (osb->local_alloc_copy) - kfree(osb->local_alloc_copy); + kfree(osb->local_alloc_copy); kfree(osb->uuid_str); ocfs2_put_dlm_debug(osb->osb_dlm_debug); memset(osb, 0, sizeof(struct ocfs2_super)); diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index f1fbb4b552ad..66edce7ecfd7 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -57,7 +57,7 @@ static int ocfs2_fast_symlink_readpage(struct file *unused, struct page *page) { struct inode *inode = page->mapping->host; - struct buffer_head *bh; + struct buffer_head *bh = NULL; int status = ocfs2_read_inode_block(inode, &bh); struct ocfs2_dinode *fe; const char *link; diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index 3d635f4bbb20..f053688d22a3 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c @@ -91,8 +91,7 @@ static struct inode **get_local_system_inode(struct ocfs2_super *osb, } else osb->local_system_inodes = local_system_inodes; spin_unlock(&osb->osb_lock); - if (unlikely(free)) - kfree(free); + kfree(free); } index = (slot * NUM_LOCAL_SYSTEM_INODES) + diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 0ba9ea1e7961..2e3ea308c144 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7189,7 +7189,7 @@ int ocfs2_init_security_and_acl(struct inode *dir, struct buffer_head *dir_bh = NULL; ret = ocfs2_init_security_get(inode, dir, qstr, NULL); - if (!ret) { + if (ret) { mlog_errno(ret); goto leave; } diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index fb5b3ff79dc6..acbaebcad3a8 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -330,7 +330,7 @@ int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header, static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir, u64 fsblock, int hindex) { - struct inode *dir = filp->f_dentry->d_inode; + struct inode *dir = file_inode(filp); struct buffer_head *bh; struct omfs_inode *oi; u64 self; @@ -405,7 +405,7 @@ out: static int omfs_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *dir = filp->f_dentry->d_inode; + struct inode *dir = file_inode(filp); struct buffer_head *bh; loff_t offset, res; unsigned int hchain, hindex; diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 25d715c7c87a..d8b0afde2179 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -572,6 +572,7 @@ static struct file_system_type omfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("omfs"); static int __init init_omfs_fs(void) { diff --git a/fs/open.c b/fs/open.c index 9b33c0cbfacf..68354466879f 100644 --- a/fs/open.c +++ b/fs/open.c @@ -30,6 +30,7 @@ #include <linux/fs_struct.h> #include <linux/ima.h> #include <linux/dnotify.h> +#include <linux/compat.h> #include "internal.h" @@ -140,6 +141,13 @@ SYSCALL_DEFINE2(truncate, const char __user *, path, long, length) return do_sys_truncate(path, length); } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length) +{ + return do_sys_truncate(path, length); +} +#endif + static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) { struct inode *inode; @@ -195,6 +203,13 @@ SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length) return ret; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length) +{ + return do_sys_ftruncate(fd, length, 1); +} +#endif + /* LFS versions of truncate are only needed on 32 bit machines */ #if BITS_PER_LONG == 32 SYSCALL_DEFINE(truncate64)(const char __user * path, loff_t length) @@ -228,7 +243,7 @@ SYSCALL_ALIAS(sys_ftruncate64, SyS_ftruncate64); int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); long ret; if (offset < 0 || len <= 0) @@ -426,7 +441,7 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd) if (!f.file) goto out; - inode = f.file->f_path.dentry->d_inode; + inode = file_inode(f.file); error = -ENOTDIR; if (!S_ISDIR(inode->i_mode)) @@ -689,7 +704,7 @@ static int do_dentry_open(struct file *f, f->f_mode = FMODE_PATH; path_get(&f->f_path); - inode = f->f_path.dentry->d_inode; + inode = f->f_inode = f->f_path.dentry->d_inode; if (f->f_mode & FMODE_WRITE) { error = __get_file_write_access(inode, f->f_path.mnt); if (error) @@ -699,7 +714,6 @@ static int do_dentry_open(struct file *f, } f->f_mapping = inode->i_mapping; - f->f_pos = 0; file_sb_list_add(f, inode->i_sb); if (unlikely(f->f_mode & FMODE_PATH)) { @@ -753,6 +767,7 @@ cleanup_file: path_put(&f->f_path); f->f_path.mnt = NULL; f->f_path.dentry = NULL; + f->f_inode = NULL; return error; } @@ -810,23 +825,22 @@ struct file *dentry_open(const struct path *path, int flags, /* We must always pass in a valid mount pointer. */ BUG_ON(!path->mnt); - error = -ENFILE; f = get_empty_filp(); - if (f == NULL) - return ERR_PTR(error); - - f->f_flags = flags; - f->f_path = *path; - error = do_dentry_open(f, NULL, cred); - if (!error) { - error = open_check_o_direct(f); - if (error) { - fput(f); + if (!IS_ERR(f)) { + f->f_flags = flags; + f->f_path = *path; + error = do_dentry_open(f, NULL, cred); + if (!error) { + /* from now on we need fput() to dispose of f */ + error = open_check_o_direct(f); + if (error) { + fput(f); + f = ERR_PTR(error); + } + } else { + put_filp(f); f = ERR_PTR(error); } - } else { - put_filp(f); - f = ERR_PTR(error); } return f; } diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 2ad080faca34..75885ffde44e 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -262,7 +262,7 @@ found: static int openpromfs_readdir(struct file * filp, void * dirent, filldir_t filldir) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct op_inode_info *oi = OP_I(inode); struct device_node *dp = oi->u.node; struct device_node *child; @@ -432,6 +432,7 @@ static struct file_system_type openprom_fs_type = { .mount = openprom_mount, .kill_sb = kill_anon_super, }; +MODULE_ALIAS_FS("openpromfs"); static void op_inode_init_once(void *data) { diff --git a/fs/pipe.c b/fs/pipe.c index bd3479db4b62..2234f3f61f8d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -361,7 +361,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, unsigned long nr_segs, loff_t pos) { struct file *filp = iocb->ki_filp; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct pipe_inode_info *pipe; int do_wakeup; ssize_t ret; @@ -486,7 +486,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov, unsigned long nr_segs, loff_t ppos) { struct file *filp = iocb->ki_filp; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct pipe_inode_info *pipe; ssize_t ret; int do_wakeup; @@ -677,7 +677,7 @@ bad_pipe_w(struct file *filp, const char __user *buf, size_t count, static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct pipe_inode_info *pipe; int count, buf, nrbufs; @@ -705,7 +705,7 @@ static unsigned int pipe_poll(struct file *filp, poll_table *wait) { unsigned int mask; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct pipe_inode_info *pipe = inode->i_pipe; int nrbufs; @@ -758,7 +758,7 @@ pipe_release(struct inode *inode, int decr, int decw) static int pipe_read_fasync(int fd, struct file *filp, int on) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); int retval; mutex_lock(&inode->i_mutex); @@ -772,7 +772,7 @@ pipe_read_fasync(int fd, struct file *filp, int on) static int pipe_write_fasync(int fd, struct file *filp, int on) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); int retval; mutex_lock(&inode->i_mutex); @@ -786,7 +786,7 @@ pipe_write_fasync(int fd, struct file *filp, int on) static int pipe_rdwr_fasync(int fd, struct file *filp, int on) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct pipe_inode_info *pipe = inode->i_pipe; int retval; @@ -863,6 +863,9 @@ pipe_rdwr_open(struct inode *inode, struct file *filp) { int ret = -ENOENT; + if (!(filp->f_mode & (FMODE_READ|FMODE_WRITE))) + return -EINVAL; + mutex_lock(&inode->i_mutex); if (inode->i_pipe) { @@ -1037,13 +1040,13 @@ int create_pipe_files(struct file **res, int flags) err = -ENFILE; f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops); - if (!f) + if (IS_ERR(f)) goto err_dentry; f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)); res[0] = alloc_file(&path, FMODE_READ, &read_pipefifo_fops); - if (!res[0]) + if (IS_ERR(res[0])) goto err_file; path_get(&path); @@ -1226,7 +1229,7 @@ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, */ struct pipe_inode_info *get_pipe_info(struct file *file) { - struct inode *i = file->f_path.dentry->d_inode; + struct inode *i = file_inode(file); return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL; } diff --git a/fs/pnode.c b/fs/pnode.c index 3e000a51ac0d..8b29d2164da6 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -9,6 +9,7 @@ #include <linux/mnt_namespace.h> #include <linux/mount.h> #include <linux/fs.h> +#include <linux/nsproxy.h> #include "internal.h" #include "pnode.h" @@ -220,6 +221,7 @@ static struct mount *get_source(struct mount *dest, int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry, struct mount *source_mnt, struct list_head *tree_list) { + struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; struct mount *m, *child; int ret = 0; struct mount *prev_dest_mnt = dest_mnt; @@ -237,6 +239,10 @@ int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry, source = get_source(m, prev_dest_mnt, prev_src_mnt, &type); + /* Notice when we are propagating across user namespaces */ + if (m->mnt_ns->user_ns != user_ns) + type |= CL_UNPRIVILEGED; + child = copy_tree(source, source->mnt.mnt_root, type); if (IS_ERR(child)) { ret = PTR_ERR(child); diff --git a/fs/pnode.h b/fs/pnode.h index 19b853a3445c..a0493d5ebfbf 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -23,6 +23,7 @@ #define CL_MAKE_SHARED 0x08 #define CL_PRIVATE 0x10 #define CL_SHARED_TO_SLAVE 0x20 +#define CL_UNPRIVILEGED 0x40 static inline void set_mnt_shared(struct mount *mnt) { diff --git a/fs/proc/Makefile b/fs/proc/Makefile index 981b05601931..712f24db9600 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -8,7 +8,8 @@ proc-y := nommu.o task_nommu.o proc-$(CONFIG_MMU) := mmu.o task_mmu.o proc-y += inode.o root.o base.o generic.o array.o \ - proc_tty.o fd.o + fd.o +proc-$(CONFIG_TTY) += proc_tty.o proc-y += cmdline.o proc-y += consoles.o proc-y += cpuinfo.o diff --git a/fs/proc/array.c b/fs/proc/array.c index 6a91e6ffbcbd..f7ed9ee46eb9 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -449,7 +449,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, do { min_flt += t->min_flt; maj_flt += t->maj_flt; - gtime += t->gtime; + gtime += task_gtime(t); t = next_thread(t); } while (t != task); @@ -472,7 +472,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, min_flt = task->min_flt; maj_flt = task->maj_flt; task_cputime_adjusted(task, &utime, &stime); - gtime = task->gtime; + gtime = task_gtime(task); } /* scale priority and nice values from timeslices to -20..20 */ diff --git a/fs/proc/base.c b/fs/proc/base.c index 9b43ff77a51e..69078c7cef1f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -73,6 +73,7 @@ #include <linux/security.h> #include <linux/ptrace.h> #include <linux/tracehook.h> +#include <linux/printk.h> #include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/audit.h> @@ -383,7 +384,7 @@ static int lstats_open(struct inode *inode, struct file *file) static ssize_t lstats_write(struct file *file, const char __user *buf, size_t count, loff_t *offs) { - struct task_struct *task = get_proc_task(file->f_dentry->d_inode); + struct task_struct *task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; @@ -602,7 +603,7 @@ static const struct inode_operations proc_def_inode_operations = { static ssize_t proc_info_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { - struct inode * inode = file->f_path.dentry->d_inode; + struct inode * inode = file_inode(file); unsigned long page; ssize_t length; struct task_struct *task = get_proc_task(inode); @@ -668,7 +669,7 @@ static const struct file_operations proc_single_file_operations = { static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) { - struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + struct task_struct *task = get_proc_task(file_inode(file)); struct mm_struct *mm; if (!task) @@ -869,7 +870,7 @@ static const struct file_operations proc_environ_operations = { static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + struct task_struct *task = get_proc_task(file_inode(file)); char buffer[PROC_NUMBUF]; int oom_adj = OOM_ADJUST_MIN; size_t len; @@ -916,7 +917,7 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf, goto out; } - task = get_proc_task(file->f_path.dentry->d_inode); + task = get_proc_task(file_inode(file)); if (!task) { err = -ESRCH; goto out; @@ -952,7 +953,7 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf, * /proc/pid/oom_adj is provided for legacy purposes, ask users to use * /proc/pid/oom_score_adj instead. */ - printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", + pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", current->comm, task_pid_nr(current), task_pid_nr(task), task_pid_nr(task)); @@ -976,7 +977,7 @@ static const struct file_operations proc_oom_adj_operations = { static ssize_t oom_score_adj_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + struct task_struct *task = get_proc_task(file_inode(file)); char buffer[PROC_NUMBUF]; short oom_score_adj = OOM_SCORE_ADJ_MIN; unsigned long flags; @@ -1019,7 +1020,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, goto out; } - task = get_proc_task(file->f_path.dentry->d_inode); + task = get_proc_task(file_inode(file)); if (!task) { err = -ESRCH; goto out; @@ -1067,7 +1068,7 @@ static const struct file_operations proc_oom_score_adj_operations = { static ssize_t proc_loginuid_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { - struct inode * inode = file->f_path.dentry->d_inode; + struct inode * inode = file_inode(file); struct task_struct *task = get_proc_task(inode); ssize_t length; char tmpbuf[TMPBUFLEN]; @@ -1084,7 +1085,7 @@ static ssize_t proc_loginuid_read(struct file * file, char __user * buf, static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { - struct inode * inode = file->f_path.dentry->d_inode; + struct inode * inode = file_inode(file); char *page, *tmp; ssize_t length; uid_t loginuid; @@ -1142,7 +1143,7 @@ static const struct file_operations proc_loginuid_operations = { static ssize_t proc_sessionid_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { - struct inode * inode = file->f_path.dentry->d_inode; + struct inode * inode = file_inode(file); struct task_struct *task = get_proc_task(inode); ssize_t length; char tmpbuf[TMPBUFLEN]; @@ -1165,7 +1166,7 @@ static const struct file_operations proc_sessionid_operations = { static ssize_t proc_fault_inject_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { - struct task_struct *task = get_proc_task(file->f_dentry->d_inode); + struct task_struct *task = get_proc_task(file_inode(file)); char buffer[PROC_NUMBUF]; size_t len; int make_it_fail; @@ -1197,7 +1198,7 @@ static ssize_t proc_fault_inject_write(struct file * file, make_it_fail = simple_strtol(strstrip(buffer), &end, 0); if (*end) return -EINVAL; - task = get_proc_task(file->f_dentry->d_inode); + task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; task->make_it_fail = make_it_fail; @@ -1237,7 +1238,7 @@ static ssize_t sched_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct task_struct *p; p = get_proc_task(inode); @@ -1288,7 +1289,7 @@ static ssize_t sched_autogroup_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct task_struct *p; char buffer[PROC_NUMBUF]; int nice; @@ -1343,7 +1344,7 @@ static const struct file_operations proc_pid_sched_autogroup_operations = { static ssize_t comm_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct task_struct *p; char buffer[TASK_COMM_LEN]; @@ -1711,7 +1712,7 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; if (!capable(CAP_SYS_ADMIN)) { - status = -EACCES; + status = -EPERM; goto out_notask; } @@ -1844,7 +1845,7 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, struct dentry *result; struct mm_struct *mm; - result = ERR_PTR(-EACCES); + result = ERR_PTR(-EPERM); if (!capable(CAP_SYS_ADMIN)) goto out; @@ -1900,7 +1901,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) ino_t ino; int ret; - ret = -EACCES; + ret = -EPERM; if (!capable(CAP_SYS_ADMIN)) goto out; @@ -2146,7 +2147,7 @@ out_no_task: static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { - struct inode * inode = file->f_path.dentry->d_inode; + struct inode * inode = file_inode(file); char *p = NULL; ssize_t length; struct task_struct *task = get_proc_task(inode); @@ -2167,7 +2168,7 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { - struct inode * inode = file->f_path.dentry->d_inode; + struct inode * inode = file_inode(file); char *page; ssize_t length; struct task_struct *task = get_proc_task(inode); @@ -2256,7 +2257,7 @@ static const struct inode_operations proc_attr_dir_inode_operations = { static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task = get_proc_task(file->f_dentry->d_inode); + struct task_struct *task = get_proc_task(file_inode(file)); struct mm_struct *mm; char buffer[PROC_NUMBUF]; size_t len; @@ -2308,7 +2309,7 @@ static ssize_t proc_coredump_filter_write(struct file *file, goto out_no_task; ret = -ESRCH; - task = get_proc_task(file->f_dentry->d_inode); + task = get_proc_task(file_inode(file)); if (!task) goto out_no_task; @@ -2618,6 +2619,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) name.name = buf; name.len = snprintf(buf, sizeof(buf), "%d", pid); + /* no ->d_hash() rejects on procfs */ dentry = d_hash_and_lookup(mnt->mnt_root, &name); if (dentry) { shrink_dcache_parent(dentry); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 76ddae83daa5..4b3b3ffb52f1 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -15,6 +15,7 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/printk.h> #include <linux/mount.h> #include <linux/init.h> #include <linux/idr.h> @@ -42,7 +43,7 @@ static ssize_t __proc_file_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { - struct inode * inode = file->f_path.dentry->d_inode; + struct inode * inode = file_inode(file); char *page; ssize_t retval=0; int eof=0; @@ -132,11 +133,8 @@ __proc_file_read(struct file *file, char __user *buf, size_t nbytes, } if (start == NULL) { - if (n > PAGE_SIZE) { - printk(KERN_ERR - "proc_file_read: Apparent buffer overflow!\n"); + if (n > PAGE_SIZE) /* Apparent buffer overflow */ n = PAGE_SIZE; - } n -= *ppos; if (n <= 0) break; @@ -144,26 +142,19 @@ __proc_file_read(struct file *file, char __user *buf, size_t nbytes, n = count; start = page + *ppos; } else if (start < page) { - if (n > PAGE_SIZE) { - printk(KERN_ERR - "proc_file_read: Apparent buffer overflow!\n"); + if (n > PAGE_SIZE) /* Apparent buffer overflow */ n = PAGE_SIZE; - } if (n > count) { /* * Don't reduce n because doing so might * cut off part of a data block. */ - printk(KERN_WARNING - "proc_file_read: Read count exceeded\n"); + pr_warn("proc_file_read: count exceeded\n"); } } else /* start >= page */ { unsigned long startoff = (unsigned long)(start - page); - if (n > (PAGE_SIZE - startoff)) { - printk(KERN_ERR - "proc_file_read: Apparent buffer overflow!\n"); + if (n > (PAGE_SIZE - startoff)) /* buffer overflow? */ n = PAGE_SIZE - startoff; - } if (n > count) n = count; } @@ -188,7 +179,7 @@ static ssize_t proc_file_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { - struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; spin_lock(&pde->pde_unload_lock); @@ -209,7 +200,7 @@ static ssize_t proc_file_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; if (pde->write_proc) { @@ -412,8 +403,7 @@ static const struct dentry_operations proc_dentry_operations = struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, struct dentry *dentry) { - struct inode *inode = NULL; - int error = -ENOENT; + struct inode *inode; spin_lock(&proc_subdir_lock); for (de = de->subdir; de ; de = de->next) { @@ -422,22 +412,16 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { pde_get(de); spin_unlock(&proc_subdir_lock); - error = -ENOMEM; inode = proc_get_inode(dir->i_sb, de); - goto out_unlock; + if (!inode) + return ERR_PTR(-ENOMEM); + d_set_d_op(dentry, &proc_dentry_operations); + d_add(dentry, inode); + return NULL; } } spin_unlock(&proc_subdir_lock); -out_unlock: - - if (inode) { - d_set_d_op(dentry, &proc_dentry_operations); - d_add(dentry, inode); - return NULL; - } - if (de) - pde_put(de); - return ERR_PTR(error); + return ERR_PTR(-ENOENT); } struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, @@ -460,7 +444,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, { unsigned int ino; int i; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); int ret = 0; ino = inode->i_ino; @@ -522,7 +506,7 @@ out: int proc_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); return proc_readdir_de(PDE(inode), filp, dirent, filldir); } @@ -576,7 +560,7 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp for (tmp = dir->subdir; tmp; tmp = tmp->next) if (strcmp(tmp->name, dp->name) == 0) { - WARN(1, KERN_WARNING "proc_dir_entry '%s/%s' already registered\n", + WARN(1, "proc_dir_entry '%s/%s' already registered\n", dir->name, dp->name); break; } @@ -837,9 +821,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) if (S_ISDIR(de->mode)) parent->nlink--; de->nlink = 0; - WARN(de->subdir, KERN_WARNING "%s: removing non-empty directory " - "'%s/%s', leaking at least '%s'\n", __func__, - de->parent->name, de->name, de->subdir->name); + WARN(de->subdir, "%s: removing non-empty directory " + "'%s/%s', leaking at least '%s'\n", __func__, + de->parent->name, de->name, de->subdir->name); pde_put(de); } EXPORT_SYMBOL(remove_proc_entry); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 439ae6886507..869116c2afbe 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -13,6 +13,7 @@ #include <linux/stat.h> #include <linux/completion.h> #include <linux/poll.h> +#include <linux/printk.h> #include <linux/file.h> #include <linux/limits.h> #include <linux/init.h> @@ -144,7 +145,7 @@ void pde_users_dec(struct proc_dir_entry *pde) static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) { - struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + struct proc_dir_entry *pde = PDE(file_inode(file)); loff_t rv = -EINVAL; loff_t (*llseek)(struct file *, loff_t, int); @@ -179,7 +180,7 @@ static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); @@ -201,7 +202,7 @@ static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); @@ -223,7 +224,7 @@ static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *pts) { - struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + struct proc_dir_entry *pde = PDE(file_inode(file)); unsigned int rv = DEFAULT_POLLMASK; unsigned int (*poll)(struct file *, struct poll_table_struct *); @@ -245,7 +246,7 @@ static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *p static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; long (*ioctl)(struct file *, unsigned int, unsigned long); @@ -268,7 +269,7 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne #ifdef CONFIG_COMPAT static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; long (*compat_ioctl)(struct file *, unsigned int, unsigned long); @@ -291,7 +292,7 @@ static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) { - struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + struct proc_dir_entry *pde = PDE(file_inode(file)); int rv = -EIO; int (*mmap)(struct file *, struct vm_area_struct *); @@ -445,12 +446,10 @@ static const struct file_operations proc_reg_file_ops_no_compat = { struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) { - struct inode * inode; + struct inode *inode = new_inode_pseudo(sb); - inode = iget_locked(sb, de->low_ino); - if (!inode) - return NULL; - if (inode->i_state & I_NEW) { + if (inode) { + inode->i_ino = de->low_ino; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; PROC_I(inode)->pde = de; @@ -478,14 +477,15 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) inode->i_fop = de->proc_fops; } } - unlock_new_inode(inode); } else pde_put(de); return inode; -} +} int proc_fill_super(struct super_block *s) { + struct inode *root_inode; + s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; s->s_blocksize = 1024; s->s_blocksize_bits = 10; @@ -494,11 +494,17 @@ int proc_fill_super(struct super_block *s) s->s_time_gran = 1; pde_get(&proc_root); - s->s_root = d_make_root(proc_get_inode(s, &proc_root)); - if (s->s_root) - return 0; + root_inode = proc_get_inode(s, &proc_root); + if (!root_inode) { + pr_err("proc_fill_super: get root inode failed\n"); + return -ENOMEM; + } - printk("proc_read_super: get root inode failed\n"); - pde_put(&proc_root); - return -ENOMEM; + s->s_root = d_make_root(root_inode); + if (!s->s_root) { + pr_err("proc_fill_super: allocate dentry failed\n"); + return -ENOMEM; + } + + return 0; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 252544c05207..85ff3a4598b3 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -11,6 +11,7 @@ #include <linux/sched.h> #include <linux/proc_fs.h> +#include <linux/binfmts.h> struct ctl_table_header; struct mempolicy; @@ -108,7 +109,7 @@ static inline int task_dumpable(struct task_struct *task) if (mm) dumpable = get_dumpable(mm); task_unlock(task); - if (dumpable == SUID_DUMPABLE_ENABLED) + if (dumpable == SUID_DUMP_USER) return 1; return 0; } diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index e96d4f18ca3a..eda6f017f272 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -17,6 +17,7 @@ #include <linux/elfcore.h> #include <linux/vmalloc.h> #include <linux/highmem.h> +#include <linux/printk.h> #include <linux/bootmem.h> #include <linux/init.h> #include <linux/slab.h> @@ -619,7 +620,7 @@ static int __init proc_kcore_init(void) proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &proc_kcore_operations); if (!proc_root_kcore) { - printk(KERN_ERR "couldn't create /proc/kcore\n"); + pr_err("couldn't create /proc/kcore\n"); return 0; /* Always returns 0. */ } /* Store text area if it's special */ diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 80e4645f7990..1efaaa19c4f3 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -40,7 +40,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) * sysctl_overcommit_ratio / 100) + total_swap_pages; cached = global_page_state(NR_FILE_PAGES) - - total_swapcache_pages - i.bufferram; + total_swapcache_pages() - i.bufferram; if (cached < 0) cached = 0; @@ -109,7 +109,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(i.freeram), K(i.bufferram), K(cached), - K(total_swapcache_pages), + K(total_swapcache_pages()), K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]), K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]), K(pages[LRU_ACTIVE_ANON]), @@ -158,7 +158,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) vmi.used >> 10, vmi.largest_chunk >> 10 #ifdef CONFIG_MEMORY_FAILURE - ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10) + ,atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index b7a47196c8c3..66b51c0383da 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -118,7 +118,7 @@ static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) struct super_block *sb = inode->i_sb; struct proc_inode *ei = PROC_I(inode); struct task_struct *task; - struct dentry *ns_dentry; + struct path ns_path; void *error = ERR_PTR(-EACCES); task = get_proc_task(inode); @@ -128,14 +128,14 @@ static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out_put_task; - ns_dentry = proc_ns_get_dentry(sb, task, ei->ns_ops); - if (IS_ERR(ns_dentry)) { - error = ERR_CAST(ns_dentry); + ns_path.dentry = proc_ns_get_dentry(sb, task, ei->ns_ops); + if (IS_ERR(ns_path.dentry)) { + error = ERR_CAST(ns_path.dentry); goto out_put_task; } - dput(nd->path.dentry); - nd->path.dentry = ns_dentry; + ns_path.mnt = mntget(nd->path.mnt); + nd_jump_link(nd, &ns_path); error = NULL; out_put_task: diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index b1822dde55c2..ccfd99bd1c5a 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -45,7 +45,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) file = region->vm_file; if (file) { - struct inode *inode = region->vm_file->f_path.dentry->d_inode; + struct inode *inode = file_inode(region->vm_file); dev = inode->i_sb->s_dev; ino = inode->i_ino; } diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index de20ec480fa0..30b590f5bd35 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -8,6 +8,7 @@ #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/printk.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/of.h> @@ -110,8 +111,8 @@ void proc_device_tree_update_prop(struct proc_dir_entry *pde, if (ent->data == oldprop) break; if (ent == NULL) { - printk(KERN_WARNING "device-tree: property \"%s\" " - " does not exist\n", oldprop->name); + pr_warn("device-tree: property \"%s\" does not exist\n", + oldprop->name); } else { ent->data = newprop; ent->size = newprop->length; @@ -153,8 +154,8 @@ static const char *fixup_name(struct device_node *np, struct proc_dir_entry *de, realloc: fixed_name = kmalloc(fixup_len, GFP_KERNEL); if (fixed_name == NULL) { - printk(KERN_ERR "device-tree: Out of memory trying to fixup " - "name \"%s\"\n", name); + pr_err("device-tree: Out of memory trying to fixup " + "name \"%s\"\n", name); return name; } @@ -175,8 +176,8 @@ retry: goto retry; } - printk(KERN_WARNING "device-tree: Duplicate name in %s, " - "renamed to \"%s\"\n", np->full_name, fixed_name); + pr_warn("device-tree: Duplicate name in %s, renamed to \"%s\"\n", + np->full_name, fixed_name); return fixed_name; } diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index fe72cd073dea..b4ac6572474f 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -163,7 +163,7 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent, struct net *net; ret = -EINVAL; - net = get_proc_task_net(filp->f_path.dentry->d_inode); + net = get_proc_task_net(file_inode(filp)); if (net != NULL) { ret = proc_readdir_de(net->proc_net, filp, dirent, filldir); put_net(net); @@ -177,20 +177,6 @@ const struct file_operations proc_net_operations = { .readdir = proc_tgid_net_readdir, }; - -struct proc_dir_entry *proc_net_fops_create(struct net *net, - const char *name, umode_t mode, const struct file_operations *fops) -{ - return proc_create(name, mode, net->proc_net, fops); -} -EXPORT_SYMBOL_GPL(proc_net_fops_create); - -void proc_net_remove(struct net *net, const char *name) -{ - remove_proc_entry(name, net->proc_net); -} -EXPORT_SYMBOL_GPL(proc_net_remove); - static __net_init int proc_net_ns_init(struct net *net) { struct proc_dir_entry *netd, *net_statd; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 1827d88ad58b..ac05f33a0dde 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -5,6 +5,7 @@ #include <linux/sysctl.h> #include <linux/poll.h> #include <linux/proc_fs.h> +#include <linux/printk.h> #include <linux/security.h> #include <linux/sched.h> #include <linux/namei.h> @@ -57,7 +58,7 @@ static void sysctl_print_dir(struct ctl_dir *dir) { if (dir->header.parent) sysctl_print_dir(dir->header.parent); - printk(KERN_CONT "%s/", dir->header.ctl_table[0].procname); + pr_cont("%s/", dir->header.ctl_table[0].procname); } static int namecmp(const char *name1, int len1, const char *name2, int len2) @@ -134,9 +135,9 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) else if (cmp > 0) p = &(*p)->rb_right; else { - printk(KERN_ERR "sysctl duplicate entry: "); + pr_err("sysctl duplicate entry: "); sysctl_print_dir(head->parent); - printk(KERN_CONT "/%s\n", entry->procname); + pr_cont("/%s\n", entry->procname); return -EEXIST; } } @@ -478,7 +479,7 @@ out: static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, size_t count, loff_t *ppos, int write) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = PROC_I(inode)->sysctl_entry; ssize_t error; @@ -542,7 +543,7 @@ static int proc_sys_open(struct inode *inode, struct file *filp) static unsigned int proc_sys_poll(struct file *filp, poll_table *wait) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = PROC_I(inode)->sysctl_entry; unsigned int ret = DEFAULT_POLLMASK; @@ -927,9 +928,9 @@ found: subdir->header.nreg++; failed: if (unlikely(IS_ERR(subdir))) { - printk(KERN_ERR "sysctl could not get directory: "); + pr_err("sysctl could not get directory: "); sysctl_print_dir(dir); - printk(KERN_CONT "/%*.*s %ld\n", + pr_cont("/%*.*s %ld\n", namelen, namelen, name, PTR_ERR(subdir)); } drop_sysctl_table(&dir->header); @@ -995,8 +996,8 @@ static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...) vaf.fmt = fmt; vaf.va = &args; - printk(KERN_ERR "sysctl table check failed: %s/%s %pV\n", - path, table->procname, &vaf); + pr_err("sysctl table check failed: %s/%s %pV\n", + path, table->procname, &vaf); va_end(args); return -EINVAL; @@ -1510,9 +1511,9 @@ static void put_links(struct ctl_table_header *header) drop_sysctl_table(link_head); } else { - printk(KERN_ERR "sysctl link missing during unregister: "); + pr_err("sysctl link missing during unregister: "); sysctl_print_dir(parent); - printk(KERN_CONT "/%s\n", name); + pr_cont("/%s\n", name); } } } diff --git a/fs/proc/root.c b/fs/proc/root.c index c6e9fac26bac..9c7fab1d23f0 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -16,6 +16,7 @@ #include <linux/sched.h> #include <linux/module.h> #include <linux/bitops.h> +#include <linux/user_namespace.h> #include <linux/mount.h> #include <linux/pid_namespace.h> #include <linux/parser.h> @@ -108,6 +109,9 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, } else { ns = task_active_pid_ns(current); options = data; + + if (!current_user_ns()->may_mount_proc) + return ERR_PTR(-EPERM); } sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ca5ce7f9f800..3e636d864d56 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -271,7 +271,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) const char *name = NULL; if (file) { - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct inode *inode = file_inode(vma->vm_file); dev = inode->i_sb->s_dev; ino = inode->i_ino; pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; @@ -743,7 +743,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, return rv; if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED) return -EINVAL; - task = get_proc_task(file->f_path.dentry->d_inode); + task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; mm = get_task_mm(task); @@ -1015,7 +1015,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, static ssize_t pagemap_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + struct task_struct *task = get_proc_task(file_inode(file)); struct mm_struct *mm; struct pagemapread pm; int ret = -ESRCH; diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 1ccfa537f5f5..56123a6f462e 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -149,7 +149,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, file = vma->vm_file; if (file) { - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct inode *inode = file_inode(vma->vm_file); dev = inode->i_sb->s_dev; ino = inode->i_ino; pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT; diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 0d5071d29985..b870f740ab5a 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -15,6 +15,7 @@ #include <linux/export.h> #include <linux/slab.h> #include <linux/highmem.h> +#include <linux/printk.h> #include <linux/bootmem.h> #include <linux/init.h> #include <linux/crash_dump.h> @@ -175,15 +176,15 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, start = map_offset_to_paddr(*fpos, &vmcore_list, &curr_m); if (!curr_m) return -EINVAL; - if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) - tsz = buflen; - - /* Calculate left bytes in current memory segment. */ - nr_bytes = (curr_m->size - (start - curr_m->paddr)); - if (tsz > nr_bytes) - tsz = nr_bytes; while (buflen) { + tsz = min_t(size_t, buflen, PAGE_SIZE - (start & ~PAGE_MASK)); + + /* Calculate left bytes in current memory segment. */ + nr_bytes = (curr_m->size - (start - curr_m->paddr)); + if (tsz > nr_bytes) + tsz = nr_bytes; + tmp = read_from_oldmem(buffer, tsz, &start, 1); if (tmp < 0) return tmp; @@ -198,12 +199,6 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, struct vmcore, list); start = curr_m->paddr; } - if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) - tsz = buflen; - /* Calculate left bytes in current memory segment. */ - nr_bytes = (curr_m->size - (start - curr_m->paddr)); - if (tsz > nr_bytes) - tsz = nr_bytes; } return acc; } @@ -553,8 +548,7 @@ static int __init parse_crash_elf64_headers(void) ehdr.e_ehsize != sizeof(Elf64_Ehdr) || ehdr.e_phentsize != sizeof(Elf64_Phdr) || ehdr.e_phnum == 0) { - printk(KERN_WARNING "Warning: Core image elf header is not" - "sane\n"); + pr_warn("Warning: Core image elf header is not sane\n"); return -EINVAL; } @@ -609,8 +603,7 @@ static int __init parse_crash_elf32_headers(void) ehdr.e_ehsize != sizeof(Elf32_Ehdr) || ehdr.e_phentsize != sizeof(Elf32_Phdr) || ehdr.e_phnum == 0) { - printk(KERN_WARNING "Warning: Core image elf header is not" - "sane\n"); + pr_warn("Warning: Core image elf header is not sane\n"); return -EINVAL; } @@ -653,8 +646,7 @@ static int __init parse_crash_elf_headers(void) if (rc < 0) return rc; if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) { - printk(KERN_WARNING "Warning: Core image elf header" - " not found\n"); + pr_warn("Warning: Core image elf header not found\n"); return -EINVAL; } @@ -673,8 +665,7 @@ static int __init parse_crash_elf_headers(void) /* Determine vmcore size. */ vmcore_size = get_vmcore_size_elf32(elfcorebuf); } else { - printk(KERN_WARNING "Warning: Core image elf header is not" - " sane\n"); + pr_warn("Warning: Core image elf header is not sane\n"); return -EINVAL; } return 0; @@ -690,7 +681,7 @@ static int __init vmcore_init(void) return rc; rc = parse_crash_elf_headers(); if (rc) { - printk(KERN_WARNING "Kdump: vmcore not initialized\n"); + pr_warn("Kdump: vmcore not initialized\n"); return rc; } diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 67de74ca85f4..e4bcb2cf055a 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -418,9 +418,25 @@ static struct file_system_type pstore_fs_type = { .kill_sb = pstore_kill_sb, }; +static struct kobject *pstore_kobj; + static int __init init_pstore_fs(void) { - return register_filesystem(&pstore_fs_type); + int err = 0; + + /* Create a convenient mount point for people to access pstore */ + pstore_kobj = kobject_create_and_add("pstore", fs_kobj); + if (!pstore_kobj) { + err = -ENOMEM; + goto out; + } + + err = register_filesystem(&pstore_fs_type); + if (err < 0) + kobject_put(pstore_kobj); + +out: + return err; } module_init(init_pstore_fs) diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 5ea2e77ff023..86d1038b5a12 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -96,6 +96,27 @@ static const char *get_reason_str(enum kmsg_dump_reason reason) } } +bool pstore_cannot_block_path(enum kmsg_dump_reason reason) +{ + /* + * In case of NMI path, pstore shouldn't be blocked + * regardless of reason. + */ + if (in_nmi()) + return true; + + switch (reason) { + /* In panic case, other cpus are stopped by smp_send_stop(). */ + case KMSG_DUMP_PANIC: + /* Emergency restart shouldn't be blocked by spin lock. */ + case KMSG_DUMP_EMERG: + return true; + default: + return false; + } +} +EXPORT_SYMBOL_GPL(pstore_cannot_block_path); + /* * callback from kmsg_dump. (s2,l2) has the most recently * written bytes, older bytes are in (s1,l1). Save as much @@ -114,10 +135,12 @@ static void pstore_dump(struct kmsg_dumper *dumper, why = get_reason_str(reason); - if (in_nmi()) { - is_locked = spin_trylock(&psinfo->buf_lock); - if (!is_locked) - pr_err("pstore dump routine blocked in NMI, may corrupt error record\n"); + if (pstore_cannot_block_path(reason)) { + is_locked = spin_trylock_irqsave(&psinfo->buf_lock, flags); + if (!is_locked) { + pr_err("pstore dump routine blocked in %s path, may corrupt error record\n" + , in_nmi() ? "NMI" : why); + } } else spin_lock_irqsave(&psinfo->buf_lock, flags); oopscount++; @@ -143,9 +166,9 @@ static void pstore_dump(struct kmsg_dumper *dumper, total += hsize + len; part++; } - if (in_nmi()) { + if (pstore_cannot_block_path(reason)) { if (is_locked) - spin_unlock(&psinfo->buf_lock); + spin_unlock_irqrestore(&psinfo->buf_lock, flags); } else spin_unlock_irqrestore(&psinfo->buf_lock, flags); } diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 7003e5266f25..288f068740f6 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -167,12 +167,16 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz) { char *hdr; - struct timeval timestamp; + struct timespec timestamp; size_t len; - do_gettimeofday(×tamp); + /* Report zeroed timestamp if called before timekeeping has resumed. */ + if (__getnstimeofday(×tamp)) { + timestamp.tv_sec = 0; + timestamp.tv_nsec = 0; + } hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lu.%lu\n", - (long)timestamp.tv_sec, (long)timestamp.tv_usec); + (long)timestamp.tv_sec, (long)(timestamp.tv_nsec / 1000)); WARN_ON_ONCE(!hdr); len = hdr ? strlen(hdr) : 0; persistent_ram_write(prz, hdr, len); diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c index 7b0329468a5d..28ce014b3cef 100644 --- a/fs/qnx4/dir.c +++ b/fs/qnx4/dir.c @@ -16,7 +16,7 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); unsigned int offset; struct buffer_head *bh; struct qnx4_inode_entry *de; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 43098bb5723a..2e8caa62da78 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -412,6 +412,7 @@ static struct file_system_type qnx4_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("qnx4"); static int __init init_qnx4_fs(void) { diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c index dc597353db3b..8798d065e400 100644 --- a/fs/qnx6/dir.c +++ b/fs/qnx6/dir.c @@ -117,7 +117,7 @@ static int qnx6_dir_longfilename(struct inode *inode, static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct super_block *s = inode->i_sb; struct qnx6_sb_info *sbi = QNX6_SB(s); loff_t pos = filp->f_pos & (QNX6_DIR_ENTRY_SIZE - 1); diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index b6addf560483..8d941edfefa1 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -285,7 +285,7 @@ static struct buffer_head *qnx6_check_first_superblock(struct super_block *s, if (fs32_to_cpu(sbi, sb->sb_magic) == QNX6_SUPER_MAGIC) { /* we got a big endian fs */ QNX6DEBUG((KERN_INFO "qnx6: fs got different" - " endianess.\n")); + " endianness.\n")); return bh; } else sbi->s_bytesex = BYTESEX_LE; @@ -672,6 +672,7 @@ static struct file_system_type qnx6_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("qnx6"); static int __init init_qnx6_fs(void) { diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 05ae3c97f7a5..3e64169ef527 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1439,8 +1439,11 @@ static void __dquot_initialize(struct inode *inode, int type) * did a write before quota was turned on */ rsv = inode_get_rsv_space(inode); - if (unlikely(rsv)) + if (unlikely(rsv)) { + spin_lock(&dq_data_lock); dquot_resv_space(inode->i_dquot[cnt], rsv); + spin_unlock(&dq_data_lock); + } } } out_err: diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index d5378d028589..8d5b438cc188 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -202,7 +202,7 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file, unsigned long pgoff, unsigned long flags) { unsigned long maxpages, lpages, nr, loop, ret; - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct page **pages = NULL, **ptr, *page; loff_t isize; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index eab8c09d3801..c24f1e10b946 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -260,6 +260,7 @@ static struct file_system_type ramfs_fs_type = { .name = "ramfs", .mount = ramfs_mount, .kill_sb = ramfs_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; static struct file_system_type rootfs_fs_type = { .name = "rootfs", diff --git a/fs/read_write.c b/fs/read_write.c index bb34af315280..e6ddc8dceb96 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -15,7 +15,9 @@ #include <linux/syscalls.h> #include <linux/pagemap.h> #include <linux/splice.h> +#include <linux/compat.h> #include "read_write.h" +#include "internal.h" #include <asm/uaccess.h> #include <asm/unistd.h> @@ -163,7 +165,7 @@ EXPORT_SYMBOL(no_llseek); loff_t default_llseek(struct file *file, loff_t offset, int whence) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); loff_t retval; mutex_lock(&inode->i_mutex); @@ -247,6 +249,13 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) return retval; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence) +{ + return sys_lseek(fd, offset, whence); +} +#endif + #ifdef __ARCH_WANT_SYS_LLSEEK SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, unsigned long, offset_low, loff_t __user *, result, @@ -278,7 +287,6 @@ out_putf: } #endif - /* * rw_verify_area doesn't like huge counts. We limit * them to something that fits in "int" so that others @@ -290,7 +298,7 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count loff_t pos; int retval = -EINVAL; - inode = file->f_path.dentry->d_inode; + inode = file_inode(file); if (unlikely((ssize_t) count < 0)) return retval; pos = *ppos; @@ -410,6 +418,33 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof EXPORT_SYMBOL(do_sync_write); +ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos) +{ + mm_segment_t old_fs; + const char __user *p; + ssize_t ret; + + if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write)) + return -EINVAL; + + old_fs = get_fs(); + set_fs(get_ds()); + p = (__force const char __user *)buf; + if (count > MAX_RW_COUNT) + count = MAX_RW_COUNT; + if (file->f_op->write) + ret = file->f_op->write(file, p, count, pos); + else + ret = do_sync_write(file, p, count, pos); + set_fs(old_fs); + if (ret > 0) { + fsnotify_modify(file); + add_wchar(current, ret); + } + inc_syscw(current); + return ret; +} + ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { ssize_t ret; @@ -901,8 +936,8 @@ ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, if (!(out.file->f_mode & FMODE_WRITE)) goto fput_out; retval = -EINVAL; - in_inode = in.file->f_path.dentry->d_inode; - out_inode = out.file->f_path.dentry->d_inode; + in_inode = file_inode(in.file); + out_inode = file_inode(out.file); retval = rw_verify_area(WRITE, out.file, &out.file->f_pos, count); if (retval < 0) goto fput_out; diff --git a/fs/readdir.c b/fs/readdir.c index 5e69ef533b77..fee38e04fae4 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -22,7 +22,7 @@ int vfs_readdir(struct file *file, filldir_t filler, void *buf) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); int res = -ENOTDIR; if (!file->f_op || !file->f_op->readdir) goto out; diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 50302d6f8895..6165bd4784f6 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -268,7 +268,7 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t * new current position before returning. */ ) { - struct inode *inode = file->f_path.dentry->d_inode; // Inode of the file that we are writing to. + struct inode *inode = file_inode(file); // Inode of the file that we are writing to. /* To simplify coding at this time, we store locked pages in array for now */ struct reiserfs_transaction_handle th; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 95d7680ead47..ea5061fd4f3e 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1603,10 +1603,10 @@ int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp, if (parent && (maxlen < 5)) { *lenp = 5; - return 255; + return FILEID_INVALID; } else if (maxlen < 3) { *lenp = 3; - return 255; + return FILEID_INVALID; } data[0] = inode->i_ino; diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 0c2185042d5f..15cb5fe6b425 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -21,7 +21,7 @@ */ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); unsigned int flags; int err = 0; diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index e60e87035bb3..9cc0740adffa 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -281,7 +281,7 @@ static int show_oidmap(struct seq_file *m, struct super_block *sb) } #if defined( REISERFS_USE_OIDMAPF ) if (sb_info->oidmap.use_file && (sb_info->oidmap.mapf != NULL)) { - loff_t size = sb_info->oidmap.mapf->f_path.dentry->d_inode->i_size; + loff_t size = file_inode(sb_info->oidmap.mapf)->i_size; total_used += size / sizeof(reiserfs_oidinterval_d_t); } #endif diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 418bdc3a57da..f8a23c3078f8 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1147,8 +1147,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin "on filesystem root."); return 0; } - qf_names[qtype] = - kmalloc(strlen(arg) + 1, GFP_KERNEL); + qf_names[qtype] = kstrdup(arg, GFP_KERNEL); if (!qf_names[qtype]) { reiserfs_warning(s, "reiserfs-2502", "not enough memory " @@ -1156,7 +1155,6 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin "quotafile name."); return 0; } - strcpy(qf_names[qtype], arg); if (qtype == USRQUOTA) *mount_options |= 1 << REISERFS_USRQUOTA; else @@ -2434,6 +2432,7 @@ struct file_system_type reiserfs_fs_type = { .kill_sb = reiserfs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("reiserfs"); MODULE_DESCRIPTION("ReiserFS journaled filesystem"); MODULE_AUTHOR("Hans Reiser <reiser@namesys.com>"); diff --git a/fs/romfs/super.c b/fs/romfs/super.c index fd7c5f60b46b..15cbc41ee365 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -147,7 +147,7 @@ static const struct address_space_operations romfs_aops = { */ static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *i = filp->f_dentry->d_inode; + struct inode *i = file_inode(filp); struct romfs_inode ri; unsigned long offset, maxoff; int j, ino, nextfh; @@ -599,6 +599,7 @@ static struct file_system_type romfs_fs_type = { .kill_sb = romfs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("romfs"); /* * inode storage initialiser diff --git a/fs/select.c b/fs/select.c index 2ef72d965036..8c1c96c27062 100644 --- a/fs/select.c +++ b/fs/select.c @@ -26,6 +26,7 @@ #include <linux/fs.h> #include <linux/rcupdate.h> #include <linux/hrtimer.h> +#include <linux/sched/rt.h> #include <asm/uaccess.h> diff --git a/fs/seq_file.c b/fs/seq_file.c index f2bc3dfd0b88..38bb59f3f2ad 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -308,27 +308,27 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence) mutex_lock(&m->lock); m->version = file->f_version; switch (whence) { - case 1: - offset += file->f_pos; - case 0: - if (offset < 0) - break; - retval = offset; - if (offset != m->read_pos) { - while ((retval=traverse(m, offset)) == -EAGAIN) - ; - if (retval) { - /* with extreme prejudice... */ - file->f_pos = 0; - m->read_pos = 0; - m->version = 0; - m->index = 0; - m->count = 0; - } else { - m->read_pos = offset; - retval = file->f_pos = offset; - } + case SEEK_CUR: + offset += file->f_pos; + case SEEK_SET: + if (offset < 0) + break; + retval = offset; + if (offset != m->read_pos) { + while ((retval = traverse(m, offset)) == -EAGAIN) + ; + if (retval) { + /* with extreme prejudice... */ + file->f_pos = 0; + m->read_pos = 0; + m->version = 0; + m->index = 0; + m->count = 0; + } else { + m->read_pos = offset; + retval = file->f_pos = offset; } + } } file->f_version = m->version; mutex_unlock(&m->lock); @@ -339,7 +339,7 @@ EXPORT_SYMBOL(seq_lseek); /** * seq_release - free the structures associated with sequential file. * @file: file in question - * @inode: file->f_path.dentry->d_inode + * @inode: its inode * * Frees the structures associated with sequential file; can be used * as ->f_op->release() if you don't have private data to destroy. diff --git a/fs/splice.c b/fs/splice.c index 6909d89d0da5..29e394e49ddd 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -31,6 +31,7 @@ #include <linux/security.h> #include <linux/gfp.h> #include <linux/socket.h> +#include "internal.h" /* * Attempt to steal a page from a pipe buffer. This should perhaps go into @@ -569,7 +570,7 @@ static ssize_t kernel_readv(struct file *file, const struct iovec *vec, return res; } -static ssize_t kernel_write(struct file *file, const char *buf, size_t count, +ssize_t kernel_write(struct file *file, const char *buf, size_t count, loff_t pos) { mm_segment_t old_fs; @@ -578,11 +579,12 @@ static ssize_t kernel_write(struct file *file, const char *buf, size_t count, old_fs = get_fs(); set_fs(get_ds()); /* The cast to a user pointer is valid due to the set_fs() */ - res = vfs_write(file, (const char __user *)buf, count, &pos); + res = vfs_write(file, (__force const char __user *)buf, count, &pos); set_fs(old_fs); return res; } +EXPORT_SYMBOL(kernel_write); ssize_t default_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, @@ -1047,9 +1049,10 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, { int ret; void *data; + loff_t tmp = sd->pos; data = buf->ops->map(pipe, buf, 0); - ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos); + ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp); buf->ops->unmap(pipe, buf, data); return ret; @@ -1170,7 +1173,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, * randomly drop data for eg socket -> socket splicing. Use the * piped splicing for that! */ - i_mode = in->f_path.dentry->d_inode->i_mode; + i_mode = file_inode(in)->i_mode; if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode))) return -EINVAL; diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c index b381305c9a47..57dc70ebbb19 100644 --- a/fs/squashfs/dir.c +++ b/fs/squashfs/dir.c @@ -102,7 +102,7 @@ static int get_dir_index_using_offset(struct super_block *sb, static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; u64 block = squashfs_i(inode)->start + msblk->directory_table; int offset = squashfs_i(inode)->offset, length, dir_count, size, diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 260e3928d4f5..60553a9053ca 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -489,6 +489,7 @@ static struct file_system_type squashfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV }; +MODULE_ALIAS_FS("squashfs"); static const struct super_operations squashfs_super_ops = { .alloc_inode = squashfs_alloc_inode, diff --git a/fs/stat.c b/fs/stat.c index 14f45459c83d..04ce1ac20d20 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -37,17 +37,17 @@ void generic_fillattr(struct inode *inode, struct kstat *stat) EXPORT_SYMBOL(generic_fillattr); -int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +int vfs_getattr(struct path *path, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = path->dentry->d_inode; int retval; - retval = security_inode_getattr(mnt, dentry); + retval = security_inode_getattr(path->mnt, path->dentry); if (retval) return retval; if (inode->i_op->getattr) - return inode->i_op->getattr(mnt, dentry, stat); + return inode->i_op->getattr(path->mnt, path->dentry, stat); generic_fillattr(inode, stat); return 0; @@ -61,8 +61,7 @@ int vfs_fstat(unsigned int fd, struct kstat *stat) int error = -EBADF; if (f.file) { - error = vfs_getattr(f.file->f_path.mnt, f.file->f_path.dentry, - stat); + error = vfs_getattr(&f.file->f_path, stat); fdput(f); } return error; @@ -89,7 +88,7 @@ retry: if (error) goto out; - error = vfs_getattr(path.mnt, path.dentry, stat); + error = vfs_getattr(&path, stat); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; diff --git a/fs/super.c b/fs/super.c index 12f123712161..7465d4364208 100644 --- a/fs/super.c +++ b/fs/super.c @@ -447,14 +447,13 @@ struct super_block *sget(struct file_system_type *type, void *data) { struct super_block *s = NULL; - struct hlist_node *node; struct super_block *old; int err; retry: spin_lock(&sb_lock); if (test) { - hlist_for_each_entry(old, node, &type->fs_supers, s_instances) { + hlist_for_each_entry(old, &type->fs_supers, s_instances) { if (!test(old, data)) continue; if (!grab_super(old)) @@ -554,10 +553,9 @@ void iterate_supers_type(struct file_system_type *type, void (*f)(struct super_block *, void *), void *arg) { struct super_block *sb, *p = NULL; - struct hlist_node *node; spin_lock(&sb_lock); - hlist_for_each_entry(sb, node, &type->fs_supers, s_instances) { + hlist_for_each_entry(sb, &type->fs_supers, s_instances) { sb->s_count++; spin_unlock(&sb_lock); @@ -842,7 +840,7 @@ int get_anon_bdev(dev_t *p) else if (error) return -EAGAIN; - if ((dev & MAX_IDR_MASK) == (1 << MINORBITS)) { + if (dev == (1 << MINORBITS)) { spin_lock(&unnamed_dev_lock); ida_remove(&unnamed_dev_ida, dev); if (unnamed_dev_start > dev) diff --git a/fs/sync.c b/fs/sync.c index 14eefeb44636..2c5d6639a66a 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -332,7 +332,7 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes, if (!f.file) goto out; - i_mode = f.file->f_path.dentry->d_inode->i_mode; + i_mode = file_inode(f.file)->i_mode; ret = -ESPIPE; if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) && !S_ISLNK(i_mode)) diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 614b2b544880..15c68f9489ae 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -70,7 +70,7 @@ static ssize_t read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) { struct bin_buffer *bb = file->private_data; - int size = file->f_path.dentry->d_inode->i_size; + int size = file_inode(file)->i_size; loff_t offs = *off; int count = min_t(size_t, bytes, PAGE_SIZE); char *temp; @@ -140,7 +140,7 @@ static ssize_t write(struct file *file, const char __user *userbuf, size_t bytes, loff_t *off) { struct bin_buffer *bb = file->private_data; - int size = file->f_path.dentry->d_inode->i_size; + int size = file_inode(file)->i_size; loff_t offs = *off; int count = min_t(size_t, bytes, PAGE_SIZE); char *temp; @@ -461,15 +461,14 @@ const struct file_operations bin_fops = { void unmap_bin_file(struct sysfs_dirent *attr_sd) { struct bin_buffer *bb; - struct hlist_node *tmp; if (sysfs_type(attr_sd) != SYSFS_KOBJ_BIN_ATTR) return; mutex_lock(&sysfs_bin_lock); - hlist_for_each_entry(bb, tmp, &attr_sd->s_bin_attr.buffers, list) { - struct inode *inode = bb->file->f_path.dentry->d_inode; + hlist_for_each_entry(bb, &attr_sd->s_bin_attr.buffers, list) { + struct inode *inode = file_inode(bb->file); unmap_mapping_range(inode->i_mapping, 0, 0, 1); } diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 2fbdff6be25c..e14512678c9b 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -1020,6 +1020,8 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) ino = parent_sd->s_ino; if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0) filp->f_pos++; + else + return 0; } if (filp->f_pos == 1) { if (parent_sd->s_parent) @@ -1028,6 +1030,8 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) ino = parent_sd->s_ino; if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0) filp->f_pos++; + else + return 0; } mutex_lock(&sysfs_mutex); for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos); @@ -1058,10 +1062,21 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) return 0; } +static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file_inode(file); + loff_t ret; + + mutex_lock(&inode->i_mutex); + ret = generic_file_llseek(file, offset, whence); + mutex_unlock(&inode->i_mutex); + + return ret; +} const struct file_operations sysfs_dir_operations = { .read = generic_read_dir, .readdir = sysfs_readdir, .release = sysfs_dir_release, - .llseek = generic_file_llseek, + .llseek = sysfs_dir_llseek, }; diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 2df555c66d57..aec3d5c98c94 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -205,6 +205,48 @@ void sysfs_unmerge_group(struct kobject *kobj, } EXPORT_SYMBOL_GPL(sysfs_unmerge_group); +/** + * sysfs_add_link_to_group - add a symlink to an attribute group. + * @kobj: The kobject containing the group. + * @group_name: The name of the group. + * @target: The target kobject of the symlink to create. + * @link_name: The name of the symlink to create. + */ +int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, + struct kobject *target, const char *link_name) +{ + struct sysfs_dirent *dir_sd; + int error = 0; + + dir_sd = sysfs_get_dirent(kobj->sd, NULL, group_name); + if (!dir_sd) + return -ENOENT; + + error = sysfs_create_link_sd(dir_sd, target, link_name); + sysfs_put(dir_sd); + + return error; +} +EXPORT_SYMBOL_GPL(sysfs_add_link_to_group); + +/** + * sysfs_remove_link_from_group - remove a symlink from an attribute group. + * @kobj: The kobject containing the group. + * @group_name: The name of the group. + * @link_name: The name of the symlink to remove. + */ +void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, + const char *link_name) +{ + struct sysfs_dirent *dir_sd; + + dir_sd = sysfs_get_dirent(kobj->sd, NULL, group_name); + if (dir_sd) { + sysfs_hash_and_remove(dir_sd, NULL, link_name); + sysfs_put(dir_sd); + } +} +EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group); EXPORT_SYMBOL_GPL(sysfs_create_group); EXPORT_SYMBOL_GPL(sysfs_update_group); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index db940a9be045..afd83273e6ce 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -10,7 +10,7 @@ * Please see Documentation/filesystems/sysfs.txt for more information. */ -#define DEBUG +#define DEBUG #include <linux/fs.h> #include <linux/mount.h> @@ -19,6 +19,7 @@ #include <linux/module.h> #include <linux/magic.h> #include <linux/slab.h> +#include <linux/user_namespace.h> #include "sysfs.h" @@ -111,6 +112,9 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, struct super_block *sb; int error; + if (!(flags & MS_KERNMOUNT) && !current_user_ns()->may_mount_sysfs) + return ERR_PTR(-EPERM); + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return ERR_PTR(-ENOMEM); diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 3c9eb5624f5e..8c940df97a52 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -21,26 +21,17 @@ #include "sysfs.h" -static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, - const char *name, int warn) +static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd, + struct kobject *target, + const char *name, int warn) { - struct sysfs_dirent *parent_sd = NULL; struct sysfs_dirent *target_sd = NULL; struct sysfs_dirent *sd = NULL; struct sysfs_addrm_cxt acxt; enum kobj_ns_type ns_type; int error; - BUG_ON(!name); - - if (!kobj) - parent_sd = &sysfs_root; - else - parent_sd = kobj->sd; - - error = -EFAULT; - if (!parent_sd) - goto out_put; + BUG_ON(!name || !parent_sd); /* target->sd can go away beneath us but is protected with * sysfs_assoc_lock. Fetch target_sd from it. @@ -96,6 +87,34 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, } /** + * sysfs_create_link_sd - create symlink to a given object. + * @sd: directory we're creating the link in. + * @target: object we're pointing to. + * @name: name of the symlink. + */ +int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target, + const char *name) +{ + return sysfs_do_create_link_sd(sd, target, name, 1); +} + +static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, + const char *name, int warn) +{ + struct sysfs_dirent *parent_sd = NULL; + + if (!kobj) + parent_sd = &sysfs_root; + else + parent_sd = kobj->sd; + + if (!parent_sd) + return -EFAULT; + + return sysfs_do_create_link_sd(parent_sd, target, name, warn); +} + +/** * sysfs_create_link - create symlink between two objects. * @kobj: object whose directory we're creating the link in. * @target: object we're pointing to. diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index d73c0932bbd6..d1e4043eb0c3 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -240,3 +240,5 @@ void unmap_bin_file(struct sysfs_dirent *attr_sd); * symlink.c */ extern const struct inode_operations sysfs_symlink_inode_operations; +int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target, + const char *name); diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index a77c42157620..3799e8dac3eb 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -68,7 +68,7 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n) static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir) { unsigned long pos = filp->f_pos; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; unsigned offset = pos & ~PAGE_CACHE_MASK; unsigned long n = pos >> PAGE_CACHE_SHIFT; diff --git a/fs/sysv/super.c b/fs/sysv/super.c index a38e87bdd78d..d0c6a007ce83 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -545,6 +545,7 @@ static struct file_system_type sysv_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("sysv"); static struct file_system_type v7_fs_type = { .owner = THIS_MODULE, @@ -553,6 +554,8 @@ static struct file_system_type v7_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("v7"); +MODULE_ALIAS("v7"); static int __init init_sysv_fs(void) { @@ -586,5 +589,4 @@ static void __exit exit_sysv_fs(void) module_init(init_sysv_fs) module_exit(exit_sysv_fs) -MODULE_ALIAS("v7"); MODULE_LICENSE("GPL"); diff --git a/fs/timerfd.c b/fs/timerfd.c index d03822bbf190..32b644f03690 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -22,6 +22,7 @@ #include <linux/anon_inodes.h> #include <linux/timerfd.h> #include <linux/syscalls.h> +#include <linux/compat.h> #include <linux/rcupdate.h> struct timerfd_ctx { @@ -278,21 +279,17 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) return ufd; } -SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, - const struct itimerspec __user *, utmr, - struct itimerspec __user *, otmr) +static int do_timerfd_settime(int ufd, int flags, + const struct itimerspec *new, + struct itimerspec *old) { struct fd f; struct timerfd_ctx *ctx; - struct itimerspec ktmr, kotmr; int ret; - if (copy_from_user(&ktmr, utmr, sizeof(ktmr))) - return -EFAULT; - if ((flags & ~TFD_SETTIME_FLAGS) || - !timespec_valid(&ktmr.it_value) || - !timespec_valid(&ktmr.it_interval)) + !timespec_valid(&new->it_value) || + !timespec_valid(&new->it_interval)) return -EINVAL; ret = timerfd_fget(ufd, &f); @@ -323,27 +320,23 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, if (ctx->expired && ctx->tintv.tv64) hrtimer_forward_now(&ctx->tmr, ctx->tintv); - kotmr.it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); - kotmr.it_interval = ktime_to_timespec(ctx->tintv); + old->it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); + old->it_interval = ktime_to_timespec(ctx->tintv); /* * Re-program the timer to the new value ... */ - ret = timerfd_setup(ctx, flags, &ktmr); + ret = timerfd_setup(ctx, flags, new); spin_unlock_irq(&ctx->wqh.lock); fdput(f); - if (otmr && copy_to_user(otmr, &kotmr, sizeof(kotmr))) - return -EFAULT; - return ret; } -SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr) +static int do_timerfd_gettime(int ufd, struct itimerspec *t) { struct fd f; struct timerfd_ctx *ctx; - struct itimerspec kotmr; int ret = timerfd_fget(ufd, &f); if (ret) return ret; @@ -356,11 +349,65 @@ SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr) hrtimer_forward_now(&ctx->tmr, ctx->tintv) - 1; hrtimer_restart(&ctx->tmr); } - kotmr.it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); - kotmr.it_interval = ktime_to_timespec(ctx->tintv); + t->it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); + t->it_interval = ktime_to_timespec(ctx->tintv); spin_unlock_irq(&ctx->wqh.lock); fdput(f); + return 0; +} +SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, + const struct itimerspec __user *, utmr, + struct itimerspec __user *, otmr) +{ + struct itimerspec new, old; + int ret; + + if (copy_from_user(&new, utmr, sizeof(new))) + return -EFAULT; + ret = do_timerfd_settime(ufd, flags, &new, &old); + if (ret) + return ret; + if (otmr && copy_to_user(otmr, &old, sizeof(old))) + return -EFAULT; + + return ret; +} + +SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr) +{ + struct itimerspec kotmr; + int ret = do_timerfd_gettime(ufd, &kotmr); + if (ret) + return ret; return copy_to_user(otmr, &kotmr, sizeof(kotmr)) ? -EFAULT: 0; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, + const struct compat_itimerspec __user *, utmr, + struct compat_itimerspec __user *, otmr) +{ + struct itimerspec new, old; + int ret; + + if (get_compat_itimerspec(&new, utmr)) + return -EFAULT; + ret = do_timerfd_settime(ufd, flags, &new, &old); + if (ret) + return ret; + if (otmr && put_compat_itimerspec(otmr, &old)) + return -EFAULT; + return ret; +} + +COMPAT_SYSCALL_DEFINE2(timerfd_gettime, int, ufd, + struct compat_itimerspec __user *, otmr) +{ + struct itimerspec kotmr; + int ret = do_timerfd_gettime(ufd, &kotmr); + if (ret) + return ret; + return put_compat_itimerspec(otmr, &kotmr) ? -EFAULT: 0; +} +#endif diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 12817ffc7345..7f60e900edff 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2459,7 +2459,7 @@ error_dump: static inline int chance(unsigned int n, unsigned int out_of) { - return !!((random32() % out_of) + 1 <= n); + return !!((prandom_u32() % out_of) + 1 <= n); } @@ -2477,13 +2477,13 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write) if (chance(1, 2)) { d->pc_delay = 1; /* Fail withing 1 minute */ - delay = random32() % 60000; + delay = prandom_u32() % 60000; d->pc_timeout = jiffies; d->pc_timeout += msecs_to_jiffies(delay); ubifs_warn("failing after %lums", delay); } else { d->pc_delay = 2; - delay = random32() % 10000; + delay = prandom_u32() % 10000; /* Fail within 10000 operations */ d->pc_cnt_max = delay; ubifs_warn("failing after %lu calls", delay); @@ -2563,7 +2563,7 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf, unsigned int from, to, ffs = chance(1, 2); unsigned char *p = (void *)buf; - from = random32() % (len + 1); + from = prandom_u32() % (len + 1); /* Corruption may only span one max. write unit */ to = min(len, ALIGN(from, c->max_write_size)); diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 8a574776a493..de08c92f2e23 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -352,7 +352,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) struct qstr nm; union ubifs_key key; struct ubifs_dent_node *dent; - struct inode *dir = file->f_path.dentry->d_inode; + struct inode *dir = file_inode(file); struct ubifs_info *c = dir->i_sb->s_fs_info; dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 5bc77817f382..f12189d2db1d 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1444,7 +1444,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct inode *inode = file_inode(vma->vm_file); struct ubifs_info *c = inode->i_sb->s_fs_info; struct timespec now = ubifs_current_time(inode); struct ubifs_budget_req req = { .new_page = 1 }; @@ -1522,6 +1522,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, ubifs_release_dirty_inode_budget(c, ui); } + wait_for_stable_page(page); unlock_page(page); return 0; diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 1a7e2d8bdbe9..648b143606cc 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -147,7 +147,7 @@ out_unlock: long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { int flags, err; - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); switch (cmd) { case FS_IOC_GETFLAGS: diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 9daaeef675dd..4b826abb1528 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -2007,28 +2007,28 @@ static int dbg_populate_lsave(struct ubifs_info *c) if (!dbg_is_chk_gen(c)) return 0; - if (random32() & 3) + if (prandom_u32() & 3) return 0; for (i = 0; i < c->lsave_cnt; i++) c->lsave[i] = c->main_first; list_for_each_entry(lprops, &c->empty_list, list) - c->lsave[random32() % c->lsave_cnt] = lprops->lnum; + c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum; list_for_each_entry(lprops, &c->freeable_list, list) - c->lsave[random32() % c->lsave_cnt] = lprops->lnum; + c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum; list_for_each_entry(lprops, &c->frdi_idx_list, list) - c->lsave[random32() % c->lsave_cnt] = lprops->lnum; + c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum; heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; for (i = 0; i < heap->cnt; i++) - c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum; + c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum; heap = &c->lpt_heap[LPROPS_DIRTY - 1]; for (i = 0; i < heap->cnt; i++) - c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum; + c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum; heap = &c->lpt_heap[LPROPS_FREE - 1]; for (i = 0; i < heap->cnt; i++) - c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum; + c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum; return 1; } diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 769701ccb5c9..ba32da3fe08a 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -126,13 +126,14 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum) else if (inum > o->inum) p = p->rb_right; else { - if (o->dnext) { + if (o->del) { spin_unlock(&c->orphan_lock); dbg_gen("deleted twice ino %lu", (unsigned long)inum); return; } - if (o->cnext) { + if (o->cmt) { + o->del = 1; o->dnext = c->orph_dnext; c->orph_dnext = o; spin_unlock(&c->orphan_lock); @@ -172,7 +173,9 @@ int ubifs_orphan_start_commit(struct ubifs_info *c) last = &c->orph_cnext; list_for_each_entry(orphan, &c->orph_new, new_list) { ubifs_assert(orphan->new); + ubifs_assert(!orphan->cmt); orphan->new = 0; + orphan->cmt = 1; *last = orphan; last = &orphan->cnext; } @@ -299,7 +302,9 @@ static int write_orph_node(struct ubifs_info *c, int atomic) cnext = c->orph_cnext; for (i = 0; i < cnt; i++) { orphan = cnext; + ubifs_assert(orphan->cmt); orph->inos[i] = cpu_to_le64(orphan->inum); + orphan->cmt = 0; cnext = orphan->cnext; orphan->cnext = NULL; } @@ -378,6 +383,7 @@ static int consolidate(struct ubifs_info *c) list_for_each_entry(orphan, &c->orph_list, list) { if (orphan->new) continue; + orphan->cmt = 1; *last = orphan; last = &orphan->cnext; cnt += 1; @@ -442,6 +448,7 @@ static void erase_deleted(struct ubifs_info *c) orphan = dnext; dnext = orphan->dnext; ubifs_assert(!orphan->new); + ubifs_assert(orphan->del); rb_erase(&orphan->rb, &c->orph_tree); list_del(&orphan->list); c->tot_orphans -= 1; @@ -531,6 +538,7 @@ static int insert_dead_orphan(struct ubifs_info *c, ino_t inum) rb_link_node(&orphan->rb, parent, p); rb_insert_color(&orphan->rb, &c->orph_tree); list_add_tail(&orphan->list, &c->orph_list); + orphan->del = 1; orphan->dnext = c->orph_dnext; c->orph_dnext = orphan; dbg_mnt("ino %lu, new %d, tot %d", (unsigned long)inum, diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index ddc0f6ae65e9..ac838b844936 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2174,6 +2174,7 @@ static struct file_system_type ubifs_fs_type = { .mount = ubifs_mount, .kill_sb = kill_ubifs_super, }; +MODULE_ALIAS_FS("ubifs"); /* * Inode slab cache constructor. diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 523bbad69c0c..52a6559275c4 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -683,7 +683,7 @@ static int alloc_idx_lebs(struct ubifs_info *c, int cnt) c->ilebs[c->ileb_cnt++] = lnum; dbg_cmt("LEB %d", lnum); } - if (dbg_is_chk_index(c) && !(random32() & 7)) + if (dbg_is_chk_index(c) && !(prandom_u32() & 7)) return -ENOSPC; return 0; } diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index d133c276fe05..b2babce4d70f 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -904,6 +904,8 @@ struct ubifs_budget_req { * @dnext: next orphan to delete * @inum: inode number * @new: %1 => added since the last commit, otherwise %0 + * @cmt: %1 => commit pending, otherwise %0 + * @del: %1 => delete pending, otherwise %0 */ struct ubifs_orphan { struct rb_node rb; @@ -912,7 +914,9 @@ struct ubifs_orphan { struct ubifs_orphan *cnext; struct ubifs_orphan *dnext; ino_t inum; - int new; + unsigned new:1; + unsigned cmt:1; + unsigned del:1; }; /** diff --git a/fs/udf/dir.c b/fs/udf/dir.c index eb8bfe2b89a5..b3e93f5e17c3 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -186,7 +186,7 @@ out: static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *dir = filp->f_path.dentry->d_inode; + struct inode *dir = file_inode(filp); int result; if (filp->f_pos == 0) { diff --git a/fs/udf/file.c b/fs/udf/file.c index 77b5953eaac8..29569dd08168 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -139,7 +139,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, { ssize_t retval; struct file *file = iocb->ki_filp; - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); int err, pos; size_t count = iocb->ki_left; struct udf_inode_info *iinfo = UDF_I(inode); @@ -178,7 +178,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = file_inode(filp); long old_block, new_block; int result = -EINVAL; @@ -204,7 +204,7 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) goto out; case UDF_RELOCATE_BLOCKS: if (!capable(CAP_SYS_ADMIN)) { - result = -EACCES; + result = -EPERM; goto out; } if (get_user(old_block, (long __user *)arg)) { diff --git a/fs/udf/inode.c b/fs/udf/inode.c index cbae1ed0b7c1..7a12e48ad819 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -67,6 +67,74 @@ static void udf_update_extents(struct inode *, struct extent_position *); static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); +static void __udf_clear_extent_cache(struct inode *inode) +{ + struct udf_inode_info *iinfo = UDF_I(inode); + + if (iinfo->cached_extent.lstart != -1) { + brelse(iinfo->cached_extent.epos.bh); + iinfo->cached_extent.lstart = -1; + } +} + +/* Invalidate extent cache */ +static void udf_clear_extent_cache(struct inode *inode) +{ + struct udf_inode_info *iinfo = UDF_I(inode); + + spin_lock(&iinfo->i_extent_cache_lock); + __udf_clear_extent_cache(inode); + spin_unlock(&iinfo->i_extent_cache_lock); +} + +/* Return contents of extent cache */ +static int udf_read_extent_cache(struct inode *inode, loff_t bcount, + loff_t *lbcount, struct extent_position *pos) +{ + struct udf_inode_info *iinfo = UDF_I(inode); + int ret = 0; + + spin_lock(&iinfo->i_extent_cache_lock); + if ((iinfo->cached_extent.lstart <= bcount) && + (iinfo->cached_extent.lstart != -1)) { + /* Cache hit */ + *lbcount = iinfo->cached_extent.lstart; + memcpy(pos, &iinfo->cached_extent.epos, + sizeof(struct extent_position)); + if (pos->bh) + get_bh(pos->bh); + ret = 1; + } + spin_unlock(&iinfo->i_extent_cache_lock); + return ret; +} + +/* Add extent to extent cache */ +static void udf_update_extent_cache(struct inode *inode, loff_t estart, + struct extent_position *pos, int next_epos) +{ + struct udf_inode_info *iinfo = UDF_I(inode); + + spin_lock(&iinfo->i_extent_cache_lock); + /* Invalidate previously cached extent */ + __udf_clear_extent_cache(inode); + if (pos->bh) + get_bh(pos->bh); + memcpy(&iinfo->cached_extent.epos, pos, + sizeof(struct extent_position)); + iinfo->cached_extent.lstart = estart; + if (next_epos) + switch (iinfo->i_alloc_type) { + case ICBTAG_FLAG_AD_SHORT: + iinfo->cached_extent.epos.offset -= + sizeof(struct short_ad); + break; + case ICBTAG_FLAG_AD_LONG: + iinfo->cached_extent.epos.offset -= + sizeof(struct long_ad); + } + spin_unlock(&iinfo->i_extent_cache_lock); +} void udf_evict_inode(struct inode *inode) { @@ -90,6 +158,7 @@ void udf_evict_inode(struct inode *inode) } kfree(iinfo->i_ext.i_data); iinfo->i_ext.i_data = NULL; + udf_clear_extent_cache(inode); if (want_delete) { udf_free_inode(inode); } @@ -105,6 +174,7 @@ static void udf_write_failed(struct address_space *mapping, loff_t to) truncate_pagecache(inode, to, isize); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { down_write(&iinfo->i_data_sem); + udf_clear_extent_cache(inode); udf_truncate_extents(inode); up_write(&iinfo->i_data_sem); } @@ -372,7 +442,7 @@ static int udf_get_block(struct inode *inode, sector_t block, iinfo->i_next_alloc_goal++; } - + udf_clear_extent_cache(inode); phys = inode_getblk(inode, block, &err, &new); if (!phys) goto abort; @@ -1171,6 +1241,7 @@ set_size: } else { if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { down_write(&iinfo->i_data_sem); + udf_clear_extent_cache(inode); memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + newsize, 0x00, bsize - newsize - udf_file_entry_alloc_offset(inode)); @@ -1184,6 +1255,7 @@ set_size: if (err) return err; down_write(&iinfo->i_data_sem); + udf_clear_extent_cache(inode); truncate_setsize(inode, newsize); udf_truncate_extents(inode); up_write(&iinfo->i_data_sem); @@ -2156,11 +2228,12 @@ int8_t inode_bmap(struct inode *inode, sector_t block, struct udf_inode_info *iinfo; iinfo = UDF_I(inode); - pos->offset = 0; - pos->block = iinfo->i_location; - pos->bh = NULL; + if (!udf_read_extent_cache(inode, bcount, &lbcount, pos)) { + pos->offset = 0; + pos->block = iinfo->i_location; + pos->bh = NULL; + } *elen = 0; - do { etype = udf_next_aext(inode, pos, eloc, elen, 1); if (etype == -1) { @@ -2170,7 +2243,8 @@ int8_t inode_bmap(struct inode *inode, sector_t block, } lbcount += *elen; } while (lbcount <= bcount); - + /* update extent cache */ + udf_update_extent_cache(inode, lbcount - *elen, pos, 1); *offset = (bcount + *elen - lbcount) >> blocksize_bits; return etype; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 95fee278ab9d..102c072c6bbf 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -1270,10 +1270,10 @@ static int udf_encode_fh(struct inode *inode, __u32 *fh, int *lenp, if (parent && (len < 5)) { *lenp = 5; - return 255; + return FILEID_INVALID; } else if (len < 3) { *lenp = 3; - return 255; + return FILEID_INVALID; } *lenp = 3; diff --git a/fs/udf/super.c b/fs/udf/super.c index e9be396a558d..9ac4057a86c9 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -118,6 +118,7 @@ static struct file_system_type udf_fstype = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("udf"); static struct kmem_cache *udf_inode_cachep; @@ -134,6 +135,8 @@ static struct inode *udf_alloc_inode(struct super_block *sb) ei->i_next_alloc_goal = 0; ei->i_strat4096 = 0; init_rwsem(&ei->i_data_sem); + ei->cached_extent.lstart = -1; + spin_lock_init(&ei->i_extent_cache_lock); return &ei->vfs_inode; } @@ -1021,7 +1024,6 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index) if (bitmap == NULL) return NULL; - bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1); bitmap->s_nr_groups = nr_groups; return bitmap; } @@ -1079,8 +1081,6 @@ static int udf_fill_partdesc_info(struct super_block *sb, if (!bitmap) return 1; map->s_uspace.s_bitmap = bitmap; - bitmap->s_extLength = le32_to_cpu( - phd->unallocSpaceBitmap.extLength); bitmap->s_extPosition = le32_to_cpu( phd->unallocSpaceBitmap.extPosition); map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP; @@ -1115,8 +1115,6 @@ static int udf_fill_partdesc_info(struct super_block *sb, if (!bitmap) return 1; map->s_fspace.s_bitmap = bitmap; - bitmap->s_extLength = le32_to_cpu( - phd->freedSpaceBitmap.extLength); bitmap->s_extPosition = le32_to_cpu( phd->freedSpaceBitmap.extPosition); map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP; @@ -1866,6 +1864,8 @@ static void udf_open_lvid(struct super_block *sb) mark_buffer_dirty(bh); sbi->s_lvid_dirty = 0; mutex_unlock(&sbi->s_alloc_mutex); + /* Make opening of filesystem visible on the media immediately */ + sync_dirty_buffer(bh); } static void udf_close_lvid(struct super_block *sb) @@ -1906,6 +1906,8 @@ static void udf_close_lvid(struct super_block *sb) mark_buffer_dirty(bh); sbi->s_lvid_dirty = 0; mutex_unlock(&sbi->s_alloc_mutex); + /* Make closing of filesystem visible on the media immediately */ + sync_dirty_buffer(bh); } u64 lvid_get_unique_id(struct super_block *sb) diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h index bb8309dcd5c1..b5cd8ed2aa12 100644 --- a/fs/udf/udf_i.h +++ b/fs/udf/udf_i.h @@ -1,6 +1,19 @@ #ifndef _UDF_I_H #define _UDF_I_H +struct extent_position { + struct buffer_head *bh; + uint32_t offset; + struct kernel_lb_addr block; +}; + +struct udf_ext_cache { + /* Extent position */ + struct extent_position epos; + /* Start logical offset in bytes */ + loff_t lstart; +}; + /* * The i_data_sem and i_mutex serve for protection of allocation information * of a regular files and symlinks. This includes all extents belonging to @@ -35,6 +48,9 @@ struct udf_inode_info { __u8 *i_data; } i_ext; struct rw_semaphore i_data_sem; + struct udf_ext_cache cached_extent; + /* Spinlock for protecting extent cache */ + spinlock_t i_extent_cache_lock; struct inode vfs_inode; }; diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 5f027227f085..ed401e94aa8c 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -80,10 +80,9 @@ struct udf_virtual_data { }; struct udf_bitmap { - __u32 s_extLength; __u32 s_extPosition; - __u16 s_nr_groups; - struct buffer_head **s_block_bitmap; + int s_nr_groups; + struct buffer_head *s_block_bitmap[0]; }; struct udf_part_map { diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index de038da6f6bd..be7dabbbcb49 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -113,11 +113,6 @@ struct ustr { uint8_t u_len; }; -struct extent_position { - struct buffer_head *bh; - uint32_t offset; - struct kernel_lb_addr block; -}; /* super.c */ diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig index e4f10a40768a..0bf6e16f8d79 100644 --- a/fs/ufs/Kconfig +++ b/fs/ufs/Kconfig @@ -29,7 +29,7 @@ config UFS_FS config UFS_FS_WRITE bool "UFS file system write support (DANGEROUS)" - depends on UFS_FS && EXPERIMENTAL + depends on UFS_FS help Say Y here if you want to try writing to UFS partitions. This is experimental, so you should back up your UFS partitions beforehand. diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index dbc90994715a..3a75ca09c506 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -433,7 +433,7 @@ static int ufs_readdir(struct file *filp, void *dirent, filldir_t filldir) { loff_t pos = filp->f_pos; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; unsigned int offset = pos & ~PAGE_CACHE_MASK; unsigned long n = pos >> PAGE_CACHE_SHIFT; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index dc8e3a861d0f..329f2f53b7ed 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1500,6 +1500,7 @@ static struct file_system_type ufs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("ufs"); static int __init init_ufs_fs(void) { diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 5a7ffe54f5d5..cc33aaf219f1 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -70,8 +70,8 @@ config XFS_RT If unsure, say N. config XFS_DEBUG - bool "XFS Debugging support (EXPERIMENTAL)" - depends on XFS_FS && EXPERIMENTAL + bool "XFS Debugging support" + depends on XFS_FS help Say Y here to get an XFS build with many debugging features, including ASSERT checks, function wrappers around macros, diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 393055fe3aef..0ad23253e8b1 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -1925,8 +1925,6 @@ xfs_alloc_fix_freelist( targs.mp = mp; targs.agbp = agbp; targs.agno = args->agno; - targs.mod = targs.minleft = targs.wasdel = targs.userdata = - targs.minalignslop = 0; targs.alignment = targs.minlen = targs.prod = targs.isfl = 1; targs.type = XFS_ALLOCTYPE_THIS_AG; targs.pag = pag; diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index aaf472532b3c..888683844d98 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -300,9 +300,12 @@ xfs_attr_set_int( if (rsvd) args.trans->t_flags |= XFS_TRANS_RESERVE; - if ((error = xfs_trans_reserve(args.trans, args.total, - XFS_ATTRSET_LOG_RES(mp, args.total), 0, - XFS_TRANS_PERM_LOG_RES, XFS_ATTRSET_LOG_COUNT))) { + error = xfs_trans_reserve(args.trans, args.total, + XFS_ATTRSETM_LOG_RES(mp) + + XFS_ATTRSETRT_LOG_RES(mp) * args.total, + 0, XFS_TRANS_PERM_LOG_RES, + XFS_ATTRSET_LOG_COUNT); + if (error) { xfs_trans_cancel(args.trans, 0); return(error); } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index cdb2d3348583..b44af9211bd9 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -147,7 +147,10 @@ xfs_bmap_local_to_extents( xfs_fsblock_t *firstblock, /* first block allocated in xaction */ xfs_extlen_t total, /* total blocks needed by transaction */ int *logflagsp, /* inode logging flags */ - int whichfork); /* data or attr fork */ + int whichfork, /* data or attr fork */ + void (*init_fn)(struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp)); /* * Search the extents list for the inode, for the extent containing bno. @@ -357,7 +360,42 @@ xfs_bmap_add_attrfork_extents( } /* - * Called from xfs_bmap_add_attrfork to handle local format files. + * Block initialisation functions for local to extent format conversion. + * As these get more complex, they will be moved to the relevant files, + * but for now they are too simple to worry about. + */ +STATIC void +xfs_bmap_local_to_extents_init_fn( + struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp) +{ + bp->b_ops = &xfs_bmbt_buf_ops; + memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); +} + +STATIC void +xfs_symlink_local_to_remote( + struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp) +{ + /* remote symlink blocks are not verifiable until CRCs come along */ + bp->b_ops = NULL; + memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); +} + +/* + * Called from xfs_bmap_add_attrfork to handle local format files. Each + * different data fork content type needs a different callout to do the + * conversion. Some are basic and only require special block initialisation + * callouts for the data formating, others (directories) are so specialised they + * handle everything themselves. + * + * XXX (dgc): investigate whether directory conversion can use the generic + * formatting callout. It should be possible - it's just a very complex + * formatter. it would also require passing the transaction through to the init + * function. */ STATIC int /* error */ xfs_bmap_add_attrfork_local( @@ -368,25 +406,29 @@ xfs_bmap_add_attrfork_local( int *flags) /* inode logging flags */ { xfs_da_args_t dargs; /* args for dir/attr code */ - int error; /* error return value */ - xfs_mount_t *mp; /* mount structure pointer */ if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip)) return 0; + if (S_ISDIR(ip->i_d.di_mode)) { - mp = ip->i_mount; memset(&dargs, 0, sizeof(dargs)); dargs.dp = ip; dargs.firstblock = firstblock; dargs.flist = flist; - dargs.total = mp->m_dirblkfsbs; + dargs.total = ip->i_mount->m_dirblkfsbs; dargs.whichfork = XFS_DATA_FORK; dargs.trans = tp; - error = xfs_dir2_sf_to_block(&dargs); - } else - error = xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags, - XFS_DATA_FORK); - return error; + return xfs_dir2_sf_to_block(&dargs); + } + + if (S_ISLNK(ip->i_d.di_mode)) + return xfs_bmap_local_to_extents(tp, ip, firstblock, 1, + flags, XFS_DATA_FORK, + xfs_symlink_local_to_remote); + + return xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags, + XFS_DATA_FORK, + xfs_bmap_local_to_extents_init_fn); } /* @@ -3099,8 +3141,6 @@ xfs_bmap_extents_to_btree( args.fsbno = *firstblock; } args.minlen = args.maxlen = args.prod = 1; - args.total = args.minleft = args.alignment = args.mod = args.isfl = - args.minalignslop = 0; args.wasdel = wasdel; *logflagsp = 0; if ((error = xfs_alloc_vextent(&args))) { @@ -3221,7 +3261,10 @@ xfs_bmap_local_to_extents( xfs_fsblock_t *firstblock, /* first block allocated in xaction */ xfs_extlen_t total, /* total blocks needed by transaction */ int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ + int whichfork, + void (*init_fn)(struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp)) { int error; /* error return value */ int flags; /* logging flags returned */ @@ -3241,12 +3284,12 @@ xfs_bmap_local_to_extents( xfs_buf_t *bp; /* buffer for extent block */ xfs_bmbt_rec_host_t *ep;/* extent record pointer */ + ASSERT((ifp->if_flags & + (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE); memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = ip->i_mount; args.firstblock = *firstblock; - ASSERT((ifp->if_flags & - (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE); /* * Allocate a block. We know we need only one, since the * file currently fits in an inode. @@ -3259,20 +3302,21 @@ xfs_bmap_local_to_extents( args.type = XFS_ALLOCTYPE_NEAR_BNO; } args.total = total; - args.mod = args.minleft = args.alignment = args.wasdel = - args.isfl = args.minalignslop = 0; args.minlen = args.maxlen = args.prod = 1; - if ((error = xfs_alloc_vextent(&args))) + error = xfs_alloc_vextent(&args); + if (error) goto done; - /* - * Can't fail, the space was reserved. - */ + + /* Can't fail, the space was reserved. */ ASSERT(args.fsbno != NULLFSBLOCK); ASSERT(args.len == 1); *firstblock = args.fsbno; bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); - bp->b_ops = &xfs_bmbt_buf_ops; - memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); + + /* initialise the block and copy the data */ + init_fn(bp, ip, ifp); + + /* account for the change in fork size and log everything */ xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); xfs_bmap_forkoff_reset(args.mp, ip, whichfork); xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); @@ -4919,8 +4963,32 @@ xfs_bmapi_write( XFS_STATS_INC(xs_blk_mapw); if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + /* + * XXX (dgc): This assumes we are only called for inodes that + * contain content neutral data in local format. Anything that + * contains caller-specific data in local format that needs + * transformation to move to a block format needs to do the + * conversion to extent format itself. + * + * Directory data forks and attribute forks handle this + * themselves, but with the addition of metadata verifiers every + * data fork in local format now contains caller specific data + * and as such conversion through this function is likely to be + * broken. + * + * The only likely user of this branch is for remote symlinks, + * but we cannot overwrite the data fork contents of the symlink + * (EEXIST occurs higher up the stack) and so it will never go + * from local format to extent format here. Hence I don't think + * this branch is ever executed intentionally and we should + * consider removing it and asserting that xfs_bmapi_write() + * cannot be called directly on local format forks. i.e. callers + * are completely responsible for local to extent format + * conversion, not xfs_bmapi_write(). + */ error = xfs_bmap_local_to_extents(tp, ip, firstblock, total, - &bma.logflags, whichfork); + &bma.logflags, whichfork, + xfs_bmap_local_to_extents_init_fn); if (error) goto error0; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index fbbb9eb92e32..8459b5d8cb71 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -951,8 +951,6 @@ xfs_buf_trylock( locked = down_trylock(&bp->b_sema) == 0; if (locked) XB_SET_OWNER(bp); - else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) - xfs_log_force(bp->b_target->bt_mount, 0); trace_xfs_buf_trylock(bp, _RET_IP_); return locked; @@ -1336,6 +1334,12 @@ _xfs_buf_ioapply( int size; int i; + /* + * Make sure we capture only current IO errors rather than stale errors + * left over from previous use of the buffer (e.g. failed readahead). + */ + bp->b_error = 0; + if (bp->b_flags & XBF_WRITE) { if (bp->b_flags & XBF_SYNCIO) rw = WRITE_SYNC; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 3f9949fee391..cf263476d6b4 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -37,109 +37,6 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_buf_log_item, bli_item); } - -#ifdef XFS_TRANS_DEBUG -/* - * This function uses an alternate strategy for tracking the bytes - * that the user requests to be logged. This can then be used - * in conjunction with the bli_orig array in the buf log item to - * catch bugs in our callers' code. - * - * We also double check the bits set in xfs_buf_item_log using a - * simple algorithm to check that every byte is accounted for. - */ -STATIC void -xfs_buf_item_log_debug( - xfs_buf_log_item_t *bip, - uint first, - uint last) -{ - uint x; - uint byte; - uint nbytes; - uint chunk_num; - uint word_num; - uint bit_num; - uint bit_set; - uint *wordp; - - ASSERT(bip->bli_logged != NULL); - byte = first; - nbytes = last - first + 1; - bfset(bip->bli_logged, first, nbytes); - for (x = 0; x < nbytes; x++) { - chunk_num = byte >> XFS_BLF_SHIFT; - word_num = chunk_num >> BIT_TO_WORD_SHIFT; - bit_num = chunk_num & (NBWORD - 1); - wordp = &(bip->__bli_format.blf_data_map[word_num]); - bit_set = *wordp & (1 << bit_num); - ASSERT(bit_set); - byte++; - } -} - -/* - * This function is called when we flush something into a buffer without - * logging it. This happens for things like inodes which are logged - * separately from the buffer. - */ -void -xfs_buf_item_flush_log_debug( - xfs_buf_t *bp, - uint first, - uint last) -{ - xfs_buf_log_item_t *bip = bp->b_fspriv; - uint nbytes; - - if (bip == NULL || (bip->bli_item.li_type != XFS_LI_BUF)) - return; - - ASSERT(bip->bli_logged != NULL); - nbytes = last - first + 1; - bfset(bip->bli_logged, first, nbytes); -} - -/* - * This function is called to verify that our callers have logged - * all the bytes that they changed. - * - * It does this by comparing the original copy of the buffer stored in - * the buf log item's bli_orig array to the current copy of the buffer - * and ensuring that all bytes which mismatch are set in the bli_logged - * array of the buf log item. - */ -STATIC void -xfs_buf_item_log_check( - xfs_buf_log_item_t *bip) -{ - char *orig; - char *buffer; - int x; - xfs_buf_t *bp; - - ASSERT(bip->bli_orig != NULL); - ASSERT(bip->bli_logged != NULL); - - bp = bip->bli_buf; - ASSERT(bp->b_length > 0); - ASSERT(bp->b_addr != NULL); - orig = bip->bli_orig; - buffer = bp->b_addr; - for (x = 0; x < BBTOB(bp->b_length); x++) { - if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) { - xfs_emerg(bp->b_mount, - "%s: bip %x buffer %x orig %x index %d", - __func__, bip, bp, orig, x); - ASSERT(0); - } - } -} -#else -#define xfs_buf_item_log_debug(x,y,z) -#define xfs_buf_item_log_check(x) -#endif - STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); /* @@ -429,7 +326,6 @@ xfs_buf_item_format( * Check to make sure everything is consistent. */ trace_xfs_buf_item_format(bip); - xfs_buf_item_log_check(bip); } /* @@ -573,8 +469,18 @@ xfs_buf_item_push( if (xfs_buf_ispinned(bp)) return XFS_ITEM_PINNED; - if (!xfs_buf_trylock(bp)) + if (!xfs_buf_trylock(bp)) { + /* + * If we have just raced with a buffer being pinned and it has + * been marked stale, we could end up stalling until someone else + * issues a log force to unpin the stale buffer. Check for the + * race condition here so xfsaild recognizes the buffer is pinned + * and queues a log force to move it along. + */ + if (xfs_buf_ispinned(bp)) + return XFS_ITEM_PINNED; return XFS_ITEM_LOCKED; + } ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); @@ -923,8 +829,6 @@ xfs_buf_item_log_segment( mask = (1 << end_bit) - 1; *wordp |= mask; } - - xfs_buf_item_log_debug(bip, first, last); } /* diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 16def435944a..ee36c88ecfde 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -98,10 +98,6 @@ typedef struct xfs_buf_log_item { unsigned int bli_flags; /* misc flags */ unsigned int bli_recur; /* lock recursion count */ atomic_t bli_refcount; /* cnt of tp refs */ -#ifdef XFS_TRANS_DEBUG - char *bli_orig; /* original buffer copy */ - char *bli_logged; /* bytes logged (bitmap) */ -#endif int bli_format_count; /* count of headers */ struct xfs_buf_log_format *bli_formats; /* array of in-log header ptrs */ struct xfs_buf_log_format __bli_format; /* embedded in-log header */ @@ -117,16 +113,6 @@ void xfs_buf_attach_iodone(struct xfs_buf *, void xfs_buf_iodone_callbacks(struct xfs_buf *); void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); -#ifdef XFS_TRANS_DEBUG -void -xfs_buf_item_flush_log_debug( - struct xfs_buf *bp, - uint first, - uint last); -#else -#define xfs_buf_item_flush_log_debug(bp, first, last) -#endif - #endif /* __KERNEL__ */ #endif /* __XFS_BUF_ITEM_H__ */ diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index a8bd26b82ecb..f852b082a084 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -78,14 +78,14 @@ xfs_swapext( goto out_put_tmp_file; } - if (IS_SWAPFILE(f.file->f_path.dentry->d_inode) || - IS_SWAPFILE(tmp.file->f_path.dentry->d_inode)) { + if (IS_SWAPFILE(file_inode(f.file)) || + IS_SWAPFILE(file_inode(tmp.file))) { error = XFS_ERROR(EINVAL); goto out_put_tmp_file; } - ip = XFS_I(f.file->f_path.dentry->d_inode); - tip = XFS_I(tmp.file->f_path.dentry->d_inode); + ip = XFS_I(file_inode(f.file)); + tip = XFS_I(file_inode(tmp.file)); if (ip->i_mount != tip->i_mount) { error = XFS_ERROR(EINVAL); diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 9e1bf5294c91..8025eb23ad72 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -612,15 +612,9 @@ xfs_qm_dqread( if (flags & XFS_QMOPT_DQALLOC) { tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp), - XFS_WRITE_LOG_RES(mp) + - /* - * Round the chunklen up to the next multiple - * of 128 (buf log item chunk size)). - */ - BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 + 128, - 0, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT); + XFS_QM_DQALLOC_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); if (error) goto error1; cancelflags = XFS_TRANS_RELEASE_LOG_RES; diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index a83611849cee..c585bc646395 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -48,7 +48,7 @@ static int xfs_fileid_length(int fileid_type) case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: return 6; } - return 255; /* invalid */ + return FILEID_INVALID; } STATIC int @@ -90,7 +90,7 @@ xfs_fs_encode_fh( len = xfs_fileid_length(fileid_type); if (*max_len < len) { *max_len = len; - return 255; + return FILEID_INVALID; } *max_len = len; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 67284edb84d7..f03bf1a456fb 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -811,7 +811,7 @@ xfs_file_fallocate( loff_t offset, loff_t len) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); long error; loff_t new_size = 0; xfs_flock64_t bf; @@ -912,7 +912,7 @@ xfs_file_readdir( void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); xfs_inode_t *ip = XFS_I(inode); int error; size_t bufsize; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 94eaeedc5498..2866b8c78b7a 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -709,8 +709,8 @@ xfs_fs_log_dummy( int error; tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP); - error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, - XFS_DEFAULT_LOG_COUNT); + error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0, + XFS_DEFAULT_LOG_COUNT); if (error) { xfs_trans_cancel(tp, 0); return error; diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index a815412eab80..515bf71ce01c 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -279,8 +279,6 @@ xfs_ialloc_ag_alloc( (args.agbno < be32_to_cpu(agi->agi_length)))) { args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); args.type = XFS_ALLOCTYPE_THIS_BNO; - args.mod = args.total = args.wasdel = args.isfl = - args.userdata = args.minalignslop = 0; args.prod = 1; /* @@ -333,8 +331,6 @@ xfs_ialloc_ag_alloc( * Allocate a fixed-size extent of inodes. */ args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.mod = args.total = args.wasdel = args.isfl = - args.userdata = args.minalignslop = 0; args.prod = 1; /* * Allow space for the inode btree to split. diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 66282dcb821b..4f201656d2d9 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2379,9 +2379,6 @@ xfs_iflush_fork( char *cp; xfs_ifork_t *ifp; xfs_mount_t *mp; -#ifdef XFS_TRANS_DEBUG - int first; -#endif static const short brootflag[2] = { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; static const short dataflag[2] = @@ -2724,9 +2721,6 @@ xfs_iflush_int( xfs_inode_log_item_t *iip; xfs_dinode_t *dip; xfs_mount_t *mp; -#ifdef XFS_TRANS_DEBUG - int first; -#endif ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(xfs_isiflocked(ip)); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 22baf6ea4fac..237e7f6f2ab3 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -419,6 +419,7 @@ static inline void xfs_iflock(struct xfs_inode *ip) static inline void xfs_ifunlock(struct xfs_inode *ip) { xfs_iflags_clear(ip, XFS_IFLOCK); + smp_mb(); wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT); } diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index d041d47d9d86..f034bd1652f0 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -269,17 +269,6 @@ xfs_inode_item_format( } else { ASSERT(!(iip->ili_fields & XFS_ILOG_DBROOT)); -#ifdef XFS_TRANS_DEBUG - if (iip->ili_root_size > 0) { - ASSERT(iip->ili_root_size == - ip->i_df.if_broot_bytes); - ASSERT(memcmp(iip->ili_orig_root, - ip->i_df.if_broot, - iip->ili_root_size) == 0); - } else { - ASSERT(ip->i_df.if_broot_bytes == 0); - } -#endif iip->ili_fields &= ~XFS_ILOG_DBROOT; } break; @@ -678,11 +667,6 @@ void xfs_inode_item_destroy( xfs_inode_t *ip) { -#ifdef XFS_TRANS_DEBUG - if (ip->i_itemp->ili_root_size != 0) { - kmem_free(ip->i_itemp->ili_orig_root); - } -#endif kmem_zone_free(xfs_ili_zone, ip->i_itemp); } diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 376d4d0b2635..779812fb3d80 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -148,10 +148,6 @@ typedef struct xfs_inode_log_item { data exts */ struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged attr exts */ -#ifdef XFS_TRANS_DEBUG - int ili_root_size; - char *ili_orig_root; -#endif xfs_inode_log_format_t ili_format; /* logged structure */ } xfs_inode_log_item_t; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index c1c3ef88a260..d681e34c2950 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -80,7 +80,7 @@ xfs_find_handle( f = fdget(hreq->fd); if (!f.file) return -EBADF; - inode = f.file->f_path.dentry->d_inode; + inode = file_inode(f.file); } else { error = user_lpath((const char __user *)hreq->path, &path); if (error) @@ -168,7 +168,7 @@ xfs_handle_to_dentry( /* * Only allow handle opens under a directory. */ - if (!S_ISDIR(parfilp->f_path.dentry->d_inode->i_mode)) + if (!S_ISDIR(file_inode(parfilp)->i_mode)) return ERR_PTR(-ENOTDIR); if (hlen != sizeof(xfs_handle_t)) @@ -1334,7 +1334,7 @@ xfs_file_ioctl( unsigned int cmd, unsigned long p) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; void __user *arg = (void __user *)p; diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 1244274a5674..63b8fc432151 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -530,7 +530,7 @@ xfs_file_compat_ioctl( unsigned cmd, unsigned long p) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; void __user *arg = (void __user *)p; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 364818eef40e..5a30dd899d2b 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -311,6 +311,62 @@ xfs_iomap_eof_want_preallocate( } /* + * Determine the initial size of the preallocation. We are beyond the current + * EOF here, but we need to take into account whether this is a sparse write or + * an extending write when determining the preallocation size. Hence we need to + * look up the extent that ends at the current write offset and use the result + * to determine the preallocation size. + * + * If the extent is a hole, then preallocation is essentially disabled. + * Otherwise we take the size of the preceeding data extent as the basis for the + * preallocation size. If the size of the extent is greater than half the + * maximum extent length, then use the current offset as the basis. This ensures + * that for large files the preallocation size always extends to MAXEXTLEN + * rather than falling short due to things like stripe unit/width alignment of + * real extents. + */ +STATIC xfs_fsblock_t +xfs_iomap_eof_prealloc_initial_size( + struct xfs_mount *mp, + struct xfs_inode *ip, + xfs_off_t offset, + xfs_bmbt_irec_t *imap, + int nimaps) +{ + xfs_fileoff_t start_fsb; + int imaps = 1; + int error; + + ASSERT(nimaps >= imaps); + + /* if we are using a specific prealloc size, return now */ + if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) + return 0; + + /* + * As we write multiple pages, the offset will always align to the + * start of a page and hence point to a hole at EOF. i.e. if the size is + * 4096 bytes, we only have one block at FSB 0, but XFS_B_TO_FSB(4096) + * will return FSB 1. Hence if there are blocks in the file, we want to + * point to the block prior to the EOF block and not the hole that maps + * directly at @offset. + */ + start_fsb = XFS_B_TO_FSB(mp, offset); + if (start_fsb) + start_fsb--; + error = xfs_bmapi_read(ip, start_fsb, 1, imap, &imaps, XFS_BMAPI_ENTIRE); + if (error) + return 0; + + ASSERT(imaps == 1); + if (imap[0].br_startblock == HOLESTARTBLOCK) + return 0; + if (imap[0].br_blockcount <= (MAXEXTLEN >> 1)) + return imap[0].br_blockcount; + return XFS_B_TO_FSB(mp, offset); +} + +/* * If we don't have a user specified preallocation size, dynamically increase * the preallocation size as the size of the file grows. Cap the maximum size * at a single extent or less if the filesystem is near full. The closer the @@ -319,20 +375,19 @@ xfs_iomap_eof_want_preallocate( STATIC xfs_fsblock_t xfs_iomap_prealloc_size( struct xfs_mount *mp, - struct xfs_inode *ip) + struct xfs_inode *ip, + xfs_off_t offset, + struct xfs_bmbt_irec *imap, + int nimaps) { xfs_fsblock_t alloc_blocks = 0; - if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) { + alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset, + imap, nimaps); + if (alloc_blocks > 0) { int shift = 0; int64_t freesp; - /* - * rounddown_pow_of_two() returns an undefined result - * if we pass in alloc_blocks = 0. Hence the "+ 1" to - * ensure we always pass in a non-zero value. - */ - alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1; alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, rounddown_pow_of_two(alloc_blocks)); @@ -358,7 +413,7 @@ xfs_iomap_prealloc_size( * have a large file on a small filesystem and the above * lowspace thresholds are smaller than MAXEXTLEN. */ - while (alloc_blocks >= freesp) + while (alloc_blocks && alloc_blocks >= freesp) alloc_blocks >>= 4; } @@ -399,7 +454,6 @@ xfs_iomap_write_delay( extsz = xfs_get_extsz_hint(ip); offset_fsb = XFS_B_TO_FSBT(mp, offset); - error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, imap, XFS_WRITE_IMAPS, &prealloc); if (error) @@ -407,7 +461,10 @@ xfs_iomap_write_delay( retry: if (prealloc) { - xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip); + xfs_fsblock_t alloc_blocks; + + alloc_blocks = xfs_iomap_prealloc_size(mp, ip, offset, imap, + XFS_WRITE_IMAPS); aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); ioalign = XFS_B_TO_FSBT(mp, aligned_offset); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 46bd9d52ab51..eec226f78a40 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -120,7 +120,7 @@ xlog_verify_iclog( struct xlog *log, struct xlog_in_core *iclog, int count, - boolean_t syncing); + bool syncing); STATIC void xlog_verify_tail_lsn( struct xlog *log, @@ -1737,7 +1737,7 @@ xlog_sync( ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); - xlog_verify_iclog(log, iclog, count, B_TRUE); + xlog_verify_iclog(log, iclog, count, true); /* account for log which doesn't start at block #0 */ XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); @@ -3611,7 +3611,7 @@ xlog_verify_iclog( struct xlog *log, struct xlog_in_core *iclog, int count, - boolean_t syncing) + bool syncing) { xlog_op_header_t *ophead; xlog_in_core_t *icptr; @@ -3659,7 +3659,7 @@ xlog_verify_iclog( /* clientid is only 1 byte */ field_offset = (__psint_t) ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr); - if (syncing == B_FALSE || (field_offset & 0x1ff)) { + if (!syncing || (field_offset & 0x1ff)) { clientid = ophead->oh_clientid; } else { idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap); @@ -3682,7 +3682,7 @@ xlog_verify_iclog( /* check length */ field_offset = (__psint_t) ((xfs_caddr_t)&(ophead->oh_len) - base_ptr); - if (syncing == B_FALSE || (field_offset & 0x1ff)) { + if (!syncing || (field_offset & 0x1ff)) { op_len = be32_to_cpu(ophead->oh_len); } else { idx = BTOBBT((__psint_t)&ophead->oh_len - diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 96fcbb85ff83..d1dba7ce75ae 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1442,9 +1442,8 @@ xlog_recover_find_tid( xlog_tid_t tid) { xlog_recover_t *trans; - struct hlist_node *n; - hlist_for_each_entry(trans, n, head, r_list) { + hlist_for_each_entry(trans, head, r_list) { if (trans->r_log_tid == tid) return trans; } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 7d6df7c00c36..3806088a8f77 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1109,8 +1109,8 @@ xfs_mount_reset_sbqflags( return 0; tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); - error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, - XFS_DEFAULT_LOG_COUNT); + error = xfs_trans_reserve(tp, 0, XFS_QM_SBCHANGE_LOG_RES(mp), + 0, 0, XFS_DEFAULT_LOG_COUNT); if (error) { xfs_trans_cancel(tp, 0); xfs_alert(mp, "%s: Superblock update failed!", __func__); @@ -1583,8 +1583,8 @@ xfs_log_sbcount(xfs_mount_t *mp) return 0; tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP); - error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, - XFS_DEFAULT_LOG_COUNT); + error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0, + XFS_DEFAULT_LOG_COUNT); if (error) { xfs_trans_cancel(tp, 0); return error; @@ -1945,8 +1945,8 @@ xfs_mount_log_sb( XFS_SB_VERSIONNUM)); tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT); - error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, - XFS_DEFAULT_LOG_COUNT); + error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0, + XFS_DEFAULT_LOG_COUNT); if (error) { xfs_trans_cancel(tp, 0); return error; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index bab8314507e4..bc907061d392 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -34,12 +34,19 @@ typedef struct xfs_trans_reservations { uint tr_addafork; /* cvt inode to attributed trans */ uint tr_writeid; /* write setuid/setgid file */ uint tr_attrinval; /* attr fork buffer invalidation */ - uint tr_attrset; /* set/create an attribute */ + uint tr_attrsetm; /* set/create an attribute at mount time */ + uint tr_attrsetrt; /* set/create an attribute at runtime */ uint tr_attrrm; /* remove an attribute */ uint tr_clearagi; /* clear bad agi unlinked ino bucket */ uint tr_growrtalloc; /* grow realtime allocations */ uint tr_growrtzero; /* grow realtime zeroing */ uint tr_growrtfree; /* grow realtime freeing */ + uint tr_qm_sbchange; /* change quota flags */ + uint tr_qm_setqlim; /* adjust quota limits */ + uint tr_qm_dqalloc; /* allocate quota on disk */ + uint tr_qm_quotaoff; /* turn quota off */ + uint tr_qm_equotaoff;/* end of turn quota off */ + uint tr_sb; /* modify superblock */ } xfs_trans_reservations_t; #ifndef __KERNEL__ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 60eff4763156..e5b5cf973781 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1584,10 +1584,9 @@ xfs_qm_write_sb_changes( int error; tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); - if ((error = xfs_trans_reserve(tp, 0, - mp->m_sb.sb_sectsize + 128, 0, - 0, - XFS_DEFAULT_LOG_COUNT))) { + error = xfs_trans_reserve(tp, 0, XFS_QM_SBCHANGE_LOG_RES(mp), + 0, 0, XFS_DEFAULT_LOG_COUNT); + if (error) { xfs_trans_cancel(tp, 0); return error; } diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 6b39115bf145..2d02eac1c9a8 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -146,7 +146,7 @@ xfs_qm_newmount( * inode goes inactive and wants to free blocks, * or via xfs_log_mount_finish. */ - *needquotamount = B_TRUE; + *needquotamount = true; *quotaflags = mp->m_qflags; mp->m_qflags = 0; } diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 8a59f8546552..cf9a34051e07 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -408,10 +408,10 @@ xfs_qm_scall_getqstat( { struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_inode *uip, *gip; - boolean_t tempuqip, tempgqip; + bool tempuqip, tempgqip; uip = gip = NULL; - tempuqip = tempgqip = B_FALSE; + tempuqip = tempgqip = false; memset(out, 0, sizeof(fs_quota_stat_t)); out->qs_version = FS_QSTAT_VERSION; @@ -434,12 +434,12 @@ xfs_qm_scall_getqstat( if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &uip) == 0) - tempuqip = B_TRUE; + tempuqip = true; } if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) { if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &gip) == 0) - tempgqip = B_TRUE; + tempgqip = true; } if (uip) { out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks; @@ -490,8 +490,9 @@ xfs_qm_scall_setqlim( return 0; tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); - if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128, - 0, 0, XFS_DEFAULT_LOG_COUNT))) { + error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp), + 0, 0, XFS_DEFAULT_LOG_COUNT); + if (error) { xfs_trans_cancel(tp, 0); return (error); } @@ -638,8 +639,9 @@ xfs_qm_log_quotaoff_end( tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END); - if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_qoff_logitem_t) * 2, - 0, 0, XFS_DEFAULT_LOG_COUNT))) { + error = xfs_trans_reserve(tp, 0, XFS_QM_QUOTAOFF_END_LOG_RES(mp), + 0, 0, XFS_DEFAULT_LOG_COUNT); + if (error) { xfs_trans_cancel(tp, 0); return (error); } @@ -671,14 +673,10 @@ xfs_qm_log_quotaoff( uint oldsbqflag=0; tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); - if ((error = xfs_trans_reserve(tp, 0, - sizeof(xfs_qoff_logitem_t) * 2 + - mp->m_sb.sb_sectsize + 128, - 0, - 0, - XFS_DEFAULT_LOG_COUNT))) { + error = xfs_trans_reserve(tp, 0, XFS_QM_QUOTAOFF_LOG_RES(mp), + 0, 0, XFS_DEFAULT_LOG_COUNT); + if (error) goto error0; - } qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); xfs_trans_log_quotaoff_item(tp, qoffi); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ab8839b26272..ea341cea68cb 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -139,9 +139,9 @@ static const match_table_t tokens = { STATIC unsigned long -suffix_strtoul(char *s, char **endp, unsigned int base) +suffix_kstrtoint(char *s, unsigned int base, int *res) { - int last, shift_left_factor = 0; + int last, shift_left_factor = 0, _res; char *value = s; last = strlen(value) - 1; @@ -158,7 +158,10 @@ suffix_strtoul(char *s, char **endp, unsigned int base) value[last] = '\0'; } - return simple_strtoul((const char *)s, endp, base) << shift_left_factor; + if (kstrtoint(s, base, &_res)) + return -EINVAL; + *res = _res << shift_left_factor; + return 0; } /* @@ -174,7 +177,7 @@ xfs_parseargs( char *options) { struct super_block *sb = mp->m_super; - char *this_char, *value, *eov; + char *this_char, *value; int dsunit = 0; int dswidth = 0; int iosize = 0; @@ -230,14 +233,16 @@ xfs_parseargs( this_char); return EINVAL; } - mp->m_logbufs = simple_strtoul(value, &eov, 10); + if (kstrtoint(value, 10, &mp->m_logbufs)) + return EINVAL; } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { if (!value || !*value) { xfs_warn(mp, "%s option requires an argument", this_char); return EINVAL; } - mp->m_logbsize = suffix_strtoul(value, &eov, 10); + if (suffix_kstrtoint(value, 10, &mp->m_logbsize)) + return EINVAL; } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { if (!value || !*value) { xfs_warn(mp, "%s option requires an argument", @@ -266,7 +271,8 @@ xfs_parseargs( this_char); return EINVAL; } - iosize = simple_strtoul(value, &eov, 10); + if (kstrtoint(value, 10, &iosize)) + return EINVAL; iosizelog = ffs(iosize) - 1; } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { if (!value || !*value) { @@ -274,7 +280,8 @@ xfs_parseargs( this_char); return EINVAL; } - iosize = suffix_strtoul(value, &eov, 10); + if (suffix_kstrtoint(value, 10, &iosize)) + return EINVAL; iosizelog = ffs(iosize) - 1; } else if (!strcmp(this_char, MNTOPT_GRPID) || !strcmp(this_char, MNTOPT_BSDGROUPS)) { @@ -296,14 +303,16 @@ xfs_parseargs( this_char); return EINVAL; } - dsunit = simple_strtoul(value, &eov, 10); + if (kstrtoint(value, 10, &dsunit)) + return EINVAL; } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { if (!value || !*value) { xfs_warn(mp, "%s option requires an argument", this_char); return EINVAL; } - dswidth = simple_strtoul(value, &eov, 10); + if (kstrtoint(value, 10, &dswidth)) + return EINVAL; } else if (!strcmp(this_char, MNTOPT_32BITINODE)) { mp->m_flags |= XFS_MOUNT_SMALL_INUMS; } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { @@ -1552,6 +1561,7 @@ static struct file_system_type xfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("xfs"); STATIC int __init xfs_init_zones(void) diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 06ed520a767f..2fd7c1ff1d21 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -37,14 +37,45 @@ #include "xfs_extent_busy.h" #include "xfs_bmap.h" #include "xfs_quota.h" +#include "xfs_qm.h" #include "xfs_trans_priv.h" #include "xfs_trans_space.h" #include "xfs_inode_item.h" +#include "xfs_log_priv.h" +#include "xfs_buf_item.h" #include "xfs_trace.h" kmem_zone_t *xfs_trans_zone; kmem_zone_t *xfs_log_item_desc_zone; +/* + * A buffer has a format structure overhead in the log in addition + * to the data, so we need to take this into account when reserving + * space in a transaction for a buffer. Round the space required up + * to a multiple of 128 bytes so that we don't change the historical + * reservation that has been used for this overhead. + */ +STATIC uint +xfs_buf_log_overhead(void) +{ + return round_up(sizeof(struct xlog_op_header) + + sizeof(struct xfs_buf_log_format), 128); +} + +/* + * Calculate out transaction log reservation per item in bytes. + * + * The nbufs argument is used to indicate the number of items that + * will be changed in a transaction. size is used to tell how many + * bytes should be reserved per item. + */ +STATIC uint +xfs_calc_buf_res( + uint nbufs, + uint size) +{ + return nbufs * (size + xfs_buf_log_overhead()); +} /* * Various log reservation values. @@ -85,18 +116,15 @@ xfs_calc_write_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((mp->m_sb.sb_inodesize + - XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + - 2 * mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 2) + - 128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + - XFS_ALLOCFREE_LOG_COUNT(mp, 2))), - (2 * mp->m_sb.sb_sectsize + - 2 * mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 2) + - 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); + MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1)))); } /* @@ -117,18 +145,17 @@ xfs_calc_itruncate_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((mp->m_sb.sb_inodesize + - XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + - 128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))), - (4 * mp->m_sb.sb_sectsize + - 4 * mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 4) + - 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)) + - 128 * 5 + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); + MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(5, 0) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) + + mp->m_in_maxlevels, 0))); } /* @@ -148,14 +175,12 @@ xfs_calc_rename_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((4 * mp->m_sb.sb_inodesize + - 2 * XFS_DIROP_LOG_RES(mp) + - 128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp))), - (3 * mp->m_sb.sb_sectsize + - 3 * mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 3) + - 128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3)))); + MAX((xfs_calc_buf_res(4, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3), + XFS_FSB_TO_B(mp, 1)))); } /* @@ -175,15 +200,12 @@ xfs_calc_link_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((mp->m_sb.sb_inodesize + - mp->m_sb.sb_inodesize + - XFS_DIROP_LOG_RES(mp) + - 128 * (2 + XFS_DIROP_LOG_COUNT(mp))), - (mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); + MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)))); } /* @@ -203,15 +225,12 @@ xfs_calc_remove_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((mp->m_sb.sb_inodesize + - mp->m_sb.sb_inodesize + - XFS_DIROP_LOG_RES(mp) + - 128 * (2 + XFS_DIROP_LOG_COUNT(mp))), - (2 * mp->m_sb.sb_sectsize + - 2 * mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 2) + - 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); + MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1)))); } /* @@ -233,18 +252,18 @@ xfs_calc_symlink_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((mp->m_sb.sb_inodesize + - mp->m_sb.sb_inodesize + - XFS_FSB_TO_B(mp, 1) + - XFS_DIROP_LOG_RES(mp) + - 1024 + - 128 * (4 + XFS_DIROP_LOG_COUNT(mp))), - (2 * mp->m_sb.sb_sectsize + - XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) + - XFS_FSB_TO_B(mp, mp->m_in_maxlevels) + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); + MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(1, 1024)), + (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(mp->m_in_maxlevels, + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)))); } /* @@ -267,18 +286,19 @@ xfs_calc_create_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((mp->m_sb.sb_inodesize + - mp->m_sb.sb_inodesize + + MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + (uint)XFS_FSB_TO_B(mp, 1) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + mp->m_sb.sb_sectsize + - XFS_FSB_TO_B(mp, 1) + - XFS_DIROP_LOG_RES(mp) + - 128 * (3 + XFS_DIROP_LOG_COUNT(mp))), - (3 * mp->m_sb.sb_sectsize + - XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) + - XFS_FSB_TO_B(mp, mp->m_in_maxlevels) + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); + xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(mp->m_in_maxlevels, + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)))); } /* @@ -306,16 +326,16 @@ xfs_calc_ifree_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - mp->m_sb.sb_inodesize + - mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_FSB_TO_B(mp, 1) + + xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + MAX((__uint16_t)XFS_FSB_TO_B(mp, 1), XFS_INODE_CLUSTER_SIZE(mp)) + - 128 * 5 + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + - XFS_ALLOCFREE_LOG_COUNT(mp, 1)); + xfs_calc_buf_res(1, 0) + + xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) + + mp->m_in_maxlevels, 0) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); } /* @@ -343,9 +363,9 @@ STATIC uint xfs_calc_growdata_reservation( struct xfs_mount *mp) { - return mp->m_sb.sb_sectsize * 3 + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1)); + return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); } /* @@ -362,12 +382,12 @@ STATIC uint xfs_calc_growrtalloc_reservation( struct xfs_mount *mp) { - return 2 * mp->m_sb.sb_sectsize + - XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + - mp->m_sb.sb_inodesize + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + - XFS_ALLOCFREE_LOG_COUNT(mp, 1)); + return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); } /* @@ -379,7 +399,7 @@ STATIC uint xfs_calc_growrtzero_reservation( struct xfs_mount *mp) { - return mp->m_sb.sb_blocksize + 128; + return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize); } /* @@ -396,11 +416,10 @@ STATIC uint xfs_calc_growrtfree_reservation( struct xfs_mount *mp) { - return mp->m_sb.sb_sectsize + - 2 * mp->m_sb.sb_inodesize + - mp->m_sb.sb_blocksize + - mp->m_rsumsize + - 128 * 5; + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) + + xfs_calc_buf_res(1, mp->m_rsumsize); } /* @@ -411,7 +430,7 @@ STATIC uint xfs_calc_swrite_reservation( struct xfs_mount *mp) { - return mp->m_sb.sb_inodesize + 128; + return xfs_calc_buf_res(1, mp->m_sb.sb_inodesize); } /* @@ -421,7 +440,7 @@ xfs_calc_swrite_reservation( STATIC uint xfs_calc_writeid_reservation(xfs_mount_t *mp) { - return mp->m_sb.sb_inodesize + 128; + return xfs_calc_buf_res(1, mp->m_sb.sb_inodesize); } /* @@ -437,13 +456,13 @@ xfs_calc_addafork_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - mp->m_sb.sb_inodesize + - mp->m_sb.sb_sectsize * 2 + - mp->m_dirblksize + - XFS_FSB_TO_B(mp, XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (4 + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1 + - XFS_ALLOCFREE_LOG_COUNT(mp, 1)); + xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(1, mp->m_dirblksize) + + xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1, + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); } /* @@ -461,35 +480,51 @@ STATIC uint xfs_calc_attrinval_reservation( struct xfs_mount *mp) { - return MAX((mp->m_sb.sb_inodesize + - XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + - 128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))), - (4 * mp->m_sb.sb_sectsize + - 4 * mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 4) + - 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)))); + return MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4), + XFS_FSB_TO_B(mp, 1)))); } /* - * Setting an attribute. + * Setting an attribute at mount time. * the inode getting the attribute * the superblock for allocations * the agfs extents are allocated from * the attribute btree * max depth * the inode allocation btree * Since attribute transaction space is dependent on the size of the attribute, - * the calculation is done partially at mount time and partially at runtime. + * the calculation is done partially at mount time and partially at runtime(see + * below). */ STATIC uint -xfs_calc_attrset_reservation( +xfs_calc_attrsetm_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - mp->m_sb.sb_inodesize + - mp->m_sb.sb_sectsize + - XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) + - 128 * (2 + XFS_DA_NODE_MAXDEPTH); + xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1)); +} + +/* + * Setting an attribute at runtime, transaction space unit per block. + * the superblock for allocations: sector size + * the inode bmap btree could join or split: max depth * block size + * Since the runtime attribute transaction space is dependent on the total + * blocks needed for the 1st bmap, here we calculate out the space unit for + * one block so that the caller could figure out the total space according + * to the attibute extent length in blocks by: ext * XFS_ATTRSETRT_LOG_RES(mp). + */ +STATIC uint +xfs_calc_attrsetrt_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), + XFS_FSB_TO_B(mp, 1)); } /* @@ -508,16 +543,15 @@ xfs_calc_attrrm_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((mp->m_sb.sb_inodesize + - XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) + - XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + - 128 * (1 + XFS_DA_NODE_MAXDEPTH + - XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))), - (2 * mp->m_sb.sb_sectsize + - 2 * mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 2) + - 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); + MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, + XFS_FSB_TO_B(mp, 1)) + + (uint)XFS_FSB_TO_B(mp, + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)), + (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1)))); } /* @@ -527,7 +561,78 @@ STATIC uint xfs_calc_clear_agi_bucket_reservation( struct xfs_mount *mp) { - return mp->m_sb.sb_sectsize + 128; + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +/* + * Clearing the quotaflags in the superblock. + * the super block for changing quota flags: sector size + */ +STATIC uint +xfs_calc_qm_sbchange_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +/* + * Adjusting quota limits. + * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot) + */ +STATIC uint +xfs_calc_qm_setqlim_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot)); +} + +/* + * Allocating quota on disk if needed. + * the write transaction log space: XFS_WRITE_LOG_RES(mp) + * the unit of quota allocation: one system block size + */ +STATIC uint +xfs_calc_qm_dqalloc_reservation( + struct xfs_mount *mp) +{ + return XFS_WRITE_LOG_RES(mp) + + xfs_calc_buf_res(1, + XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1); +} + +/* + * Turning off quotas. + * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 + * the superblock for the quota flags: sector size + */ +STATIC uint +xfs_calc_qm_quotaoff_reservation( + struct xfs_mount *mp) +{ + return sizeof(struct xfs_qoff_logitem) * 2 + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +/* + * End of turning off quotas. + * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 + */ +STATIC uint +xfs_calc_qm_quotaoff_end_reservation( + struct xfs_mount *mp) +{ + return sizeof(struct xfs_qoff_logitem) * 2; +} + +/* + * Syncing the incore super block changes to disk. + * the super block to reflect the changes: sector size + */ +STATIC uint +xfs_calc_sb_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); } /* @@ -555,12 +660,19 @@ xfs_trans_init( resp->tr_writeid = xfs_calc_writeid_reservation(mp); resp->tr_addafork = xfs_calc_addafork_reservation(mp); resp->tr_attrinval = xfs_calc_attrinval_reservation(mp); - resp->tr_attrset = xfs_calc_attrset_reservation(mp); + resp->tr_attrsetm = xfs_calc_attrsetm_reservation(mp); + resp->tr_attrsetrt = xfs_calc_attrsetrt_reservation(mp); resp->tr_attrrm = xfs_calc_attrrm_reservation(mp); resp->tr_clearagi = xfs_calc_clear_agi_bucket_reservation(mp); resp->tr_growrtalloc = xfs_calc_growrtalloc_reservation(mp); resp->tr_growrtzero = xfs_calc_growrtzero_reservation(mp); resp->tr_growrtfree = xfs_calc_growrtfree_reservation(mp); + resp->tr_qm_sbchange = xfs_calc_qm_sbchange_reservation(mp); + resp->tr_qm_setqlim = xfs_calc_qm_setqlim_reservation(mp); + resp->tr_qm_dqalloc = xfs_calc_qm_dqalloc_reservation(mp); + resp->tr_qm_quotaoff = xfs_calc_qm_quotaoff_reservation(mp); + resp->tr_qm_equotaoff = xfs_calc_qm_quotaoff_end_reservation(mp); + resp->tr_sb = xfs_calc_sb_reservation(mp); } /* diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index c6c0601abd7a..cd29f6171021 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -252,17 +252,19 @@ struct xfs_log_item_desc { * as long as SWRITE logs the entire inode core */ #define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) -#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) +#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) #define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork) #define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval) -#define XFS_ATTRSET_LOG_RES(mp, ext) \ - ((mp)->m_reservations.tr_attrset + \ - (ext * (mp)->m_sb.sb_sectsize) + \ - (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \ - (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))))) -#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm) +#define XFS_ATTRSETM_LOG_RES(mp) ((mp)->m_reservations.tr_attrsetm) +#define XFS_ATTRSETRT_LOG_RES(mp) ((mp)->m_reservations.tr_attrsetrt) +#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm) #define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi) - +#define XFS_QM_SBCHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_qm_sbchange) +#define XFS_QM_SETQLIM_LOG_RES(mp) ((mp)->m_reservations.tr_qm_setqlim) +#define XFS_QM_DQALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_qm_dqalloc) +#define XFS_QM_QUOTAOFF_LOG_RES(mp) ((mp)->m_reservations.tr_qm_quotaoff) +#define XFS_QM_QUOTAOFF_END_LOG_RES(mp) ((mp)->m_reservations.tr_qm_equotaoff) +#define XFS_SB_LOG_RES(mp) ((mp)->m_reservations.tr_sb) /* * Various log count values. diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 6011ee661339..0eda7254305f 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -55,20 +55,6 @@ xfs_ail_check( ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0); -#ifdef XFS_TRANS_DEBUG - /* - * Walk the list checking lsn ordering, and that every entry has the - * XFS_LI_IN_AIL flag set. This is really expensive, so only do it - * when specifically debugging the transaction subsystem. - */ - prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail); - list_for_each_entry(lip, &ailp->xa_ail, li_ail) { - if (&prev_lip->li_ail != &ailp->xa_ail) - ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); - ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); - prev_lip = lip; - } -#endif /* XFS_TRANS_DEBUG */ } #else /* !DEBUG */ #define xfs_ail_check(a,l) diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 0c7fa54f309e..642c2d6e1db1 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -516,7 +516,7 @@ xfs_trans_unreserve_and_mod_dquots( int i, j; xfs_dquot_t *dqp; xfs_dqtrx_t *qtrx, *qa; - boolean_t locked; + bool locked; if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY)) return; @@ -537,17 +537,17 @@ xfs_trans_unreserve_and_mod_dquots( * about the number of blocks used field, or deltas. * Also we don't bother to zero the fields. */ - locked = B_FALSE; + locked = false; if (qtrx->qt_blk_res) { xfs_dqlock(dqp); - locked = B_TRUE; + locked = true; dqp->q_res_bcount -= (xfs_qcnt_t)qtrx->qt_blk_res; } if (qtrx->qt_ino_res) { if (!locked) { xfs_dqlock(dqp); - locked = B_TRUE; + locked = true; } dqp->q_res_icount -= (xfs_qcnt_t)qtrx->qt_ino_res; @@ -556,7 +556,7 @@ xfs_trans_unreserve_and_mod_dquots( if (qtrx->qt_rtblk_res) { if (!locked) { xfs_dqlock(dqp); - locked = B_TRUE; + locked = true; } dqp->q_res_rtbcount -= (xfs_qcnt_t)qtrx->qt_rtblk_res; diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index d2eee20d5f5b..ac6d567704db 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -33,14 +33,6 @@ #include "xfs_inode_item.h" #include "xfs_trace.h" -#ifdef XFS_TRANS_DEBUG -STATIC void -xfs_trans_inode_broot_debug( - xfs_inode_t *ip); -#else -#define xfs_trans_inode_broot_debug(ip) -#endif - /* * Add a locked inode to the transaction. * @@ -67,8 +59,6 @@ xfs_trans_ijoin( * Get a log_item_desc to point at the new item. */ xfs_trans_add_item(tp, &iip->ili_item); - - xfs_trans_inode_broot_debug(ip); } /* @@ -135,34 +125,3 @@ xfs_trans_log_inode( flags |= ip->i_itemp->ili_last_fields; ip->i_itemp->ili_fields |= flags; } - -#ifdef XFS_TRANS_DEBUG -/* - * Keep track of the state of the inode btree root to make sure we - * log it properly. - */ -STATIC void -xfs_trans_inode_broot_debug( - xfs_inode_t *ip) -{ - xfs_inode_log_item_t *iip; - - ASSERT(ip->i_itemp != NULL); - iip = ip->i_itemp; - if (iip->ili_root_size != 0) { - ASSERT(iip->ili_orig_root != NULL); - kmem_free(iip->ili_orig_root); - iip->ili_root_size = 0; - iip->ili_orig_root = NULL; - } - if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { - ASSERT((ip->i_df.if_broot != NULL) && - (ip->i_df.if_broot_bytes > 0)); - iip->ili_root_size = ip->i_df.if_broot_bytes; - iip->ili_orig_root = - (char*)kmem_alloc(iip->ili_root_size, KM_SLEEP); - memcpy(iip->ili_orig_root, (char*)(ip->i_df.if_broot), - iip->ili_root_size); - } -} -#endif diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index 7a41874f4c20..61ba1cfa974c 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h @@ -32,7 +32,6 @@ typedef unsigned int __uint32_t; typedef signed long long int __int64_t; typedef unsigned long long int __uint64_t; -typedef enum { B_FALSE,B_TRUE } boolean_t; typedef __uint32_t prid_t; /* project ID */ typedef __uint32_t inst_t; /* an instruction */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index d95f565a390e..77ad74834baa 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -725,7 +725,7 @@ xfs_create( int error; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - boolean_t unlock_dp_on_error = B_FALSE; + bool unlock_dp_on_error = false; uint cancel_flags; int committed; prid_t prid; @@ -794,7 +794,7 @@ xfs_create( } xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); - unlock_dp_on_error = B_TRUE; + unlock_dp_on_error = true; xfs_bmap_init(&free_list, &first_block); @@ -830,7 +830,7 @@ xfs_create( * error path. */ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - unlock_dp_on_error = B_FALSE; + unlock_dp_on_error = false; error = xfs_dir_createname(tp, dp, name, ip->i_ino, &first_block, &free_list, resblks ? @@ -1367,7 +1367,7 @@ xfs_symlink( int pathlen; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - boolean_t unlock_dp_on_error = B_FALSE; + bool unlock_dp_on_error = false; uint cancel_flags; int committed; xfs_fileoff_t first_fsb; @@ -1438,7 +1438,7 @@ xfs_symlink( } xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); - unlock_dp_on_error = B_TRUE; + unlock_dp_on_error = true; /* * Check whether the directory allows new symlinks or not. @@ -1484,7 +1484,7 @@ xfs_symlink( * error path. */ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - unlock_dp_on_error = B_FALSE; + unlock_dp_on_error = false; /* * Also attach the dquot(s) to it, if applicable. |